From b36bca2c00e3cbe0fcbb5c4064e325ebf12d58ef Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Sep 2023 11:32:46 -0700
Subject: [PATCH] Squash

---
 .flake8 | 24 -
 .github/copy-pr-bot.yaml | 4 +
 .github/ops-bot.yaml | 1 -
 .github/workflows/build.yaml | 33 +-
 .github/workflows/pr.yaml | 47 +-
 .github/workflows/test.yaml | 25 +-
 .gitignore | 1 +
 .pre-commit-config.yaml | 19 +-
 CHANGELOG.md | 428 ++++
 CONTRIBUTING.md | 2 +-
 README.md | 4 +-
 build.sh | 34 +-
 ci/build_cpp.sh | 4 +-
 ci/build_docs.sh | 36 +-
 ci/build_python.sh | 9 +-
 ci/build_wheel.sh | 55 +
 ci/build_wheel_cudf.sh | 16 +
 ci/build_wheel_dask_cudf.sh | 11 +
 ci/check_style.sh | 2 +-
 ci/checks/copyright.py | 1 -
 ci/checks/doxygen.sh | 8 +-
 ci/docs/build.sh | 61 -
 ci/release/apply_wheel_modifications.sh | 32 -
 ci/release/update-version.sh | 43 +-
 ci/test_cpp.sh | 44 +-
 ci/test_wheel_cudf.sh | 17 +
 ci/test_wheel_dask_cudf.sh | 19 +
 .../all_cuda-118_arch-x86_64.yaml | 47 +-
 .../all_cuda-120_arch-x86_64.yaml | 98 +
 conda/recipes/cudf/conda_build_config.yaml | 5 +-
 conda/recipes/cudf/meta.yaml | 60 +-
 .../cudf_kafka/conda_build_config.yaml | 3 +
 conda/recipes/cudf_kafka/meta.yaml | 11 +-
 conda/recipes/custreamz/meta.yaml | 15 +-
 conda/recipes/dask-cudf/meta.yaml | 18 +-
 conda/recipes/dask-cudf/run_test.sh | 8 +-
 conda/recipes/libcudf/build.sh | 8 +-
 conda/recipes/libcudf/conda_build_config.yaml | 41 +-
 conda/recipes/libcudf/meta.yaml | 104 +-
 conda/recipes/libcudf/nvcomp.txt | 3 -
 conda/recipes/libcudf/post-link.sh | 6 -
 cpp/CMakeLists.txt | 85 +-
 cpp/benchmarks/CMakeLists.txt | 40 +-
 cpp/benchmarks/binaryop/compiled_binaryop.cpp | 4 +
 cpp/benchmarks/common/generate_input.cu | 47 +-
 cpp/benchmarks/common/generate_input.hpp | 15 +
 cpp/benchmarks/copying/contiguous_split.cu | 94 +-
 cpp/benchmarks/copying/copy_if_else.cpp | 8 +
 cpp/benchmarks/copying/gather.cu | 4 +-
 cpp/benchmarks/fixture/benchmark_fixture.hpp | 12 +-
 cpp/benchmarks/fixture/nvbench_fixture.hpp | 94 +
 cpp/benchmarks/fixture/nvbench_main.cpp | 21 +-
 cpp/benchmarks/fixture/rmm_pool_raii.hpp | 81 -
 .../fixture/templated_benchmark_fixture.hpp | 4 +-
 cpp/benchmarks/groupby/group_max.cpp | 3 +-
 cpp/benchmarks/groupby/group_no_requests.cpp | 4 +-
 cpp/benchmarks/groupby/group_nth.cpp | 2 +-
 cpp/benchmarks/groupby/group_nunique.cpp | 3 +-
 cpp/benchmarks/groupby/group_rank.cpp | 1 -
 cpp/benchmarks/groupby/group_scan.cpp | 4 +-
 cpp/benchmarks/groupby/group_shift.cpp | 4 +-
 cpp/benchmarks/groupby/group_struct_keys.cpp | 9 +-
 cpp/benchmarks/groupby/group_sum.cpp | 4 +-
 cpp/benchmarks/hashing/hash.cpp | 84 +-
 cpp/benchmarks/hashing/partition.cpp | 7 +
 cpp/benchmarks/io/csv/csv_reader_input.cpp | 1 -
 cpp/benchmarks/io/csv/csv_reader_options.cpp | 1 -
 cpp/benchmarks/io/csv/csv_writer.cpp | 2 +-
 cpp/benchmarks/io/fst.cu | 55 +-
 cpp/benchmarks/io/json/json_reader_input.cpp | 48 +-
 cpp/benchmarks/io/json/json_writer.cpp | 2 +-
 cpp/benchmarks/io/json/nested_json.cpp | 11 +-
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 3 +-
 cpp/benchmarks/io/orc/orc_reader_options.cpp | 3 +-
 cpp/benchmarks/io/orc/orc_writer.cpp | 3 +-
 cpp/benchmarks/io/orc/orc_writer_chunks.cpp | 3 +-
 .../io/parquet/parquet_reader_input.cpp | 48 +-
 .../io/parquet/parquet_reader_options.cpp | 3 +-
 cpp/benchmarks/io/parquet/parquet_writer.cpp | 3 +-
 .../io/parquet/parquet_writer_chunks.cpp | 3 +-
 cpp/benchmarks/io/text/multibyte_split.cpp | 1 -
 cpp/benchmarks/iterator/iterator.cu | 4 +-
 cpp/benchmarks/join/generate_input_tables.cuh | 34 +-
 cpp/benchmarks/join/join.cu | 1 -
cpp/benchmarks/join/join_common.hpp | 4 +- cpp/benchmarks/join/mixed_join.cu | 1 - cpp/benchmarks/lists/copying/scatter_lists.cu | 24 +- cpp/benchmarks/lists/set_operations.cpp | 1 - cpp/benchmarks/null_mask/set_null_mask.cpp | 2 +- cpp/benchmarks/quantiles/quantiles.cpp | 6 +- cpp/benchmarks/reduction/anyall.cpp | 2 +- cpp/benchmarks/reduction/dictionary.cpp | 2 +- cpp/benchmarks/reduction/minmax.cpp | 2 +- cpp/benchmarks/reduction/rank.cpp | 1 - cpp/benchmarks/reduction/reduce.cpp | 2 +- cpp/benchmarks/reduction/scan_structs.cpp | 1 - cpp/benchmarks/reduction/segmented_reduce.cpp | 4 +- cpp/benchmarks/search/contains.cpp | 1 - cpp/benchmarks/sort/nested_types_common.hpp | 11 +- cpp/benchmarks/sort/rank.cpp | 2 +- cpp/benchmarks/sort/rank_structs.cpp | 2 +- cpp/benchmarks/sort/segmented_sort.cpp | 1 - cpp/benchmarks/sort/sort.cpp | 4 +- cpp/benchmarks/sort/sort_lists.cpp | 2 +- .../stream_compaction/apply_boolean_mask.cpp | 6 +- cpp/benchmarks/stream_compaction/distinct.cpp | 1 - .../stream_compaction/stable_distinct.cpp | 96 + cpp/benchmarks/stream_compaction/unique.cpp | 1 - .../stream_compaction/unique_count.cpp | 1 - cpp/benchmarks/string/case.cpp | 7 +- cpp/benchmarks/string/char_types.cpp | 66 + cpp/benchmarks/string/contains.cpp | 103 +- cpp/benchmarks/string/convert_durations.cpp | 4 +- cpp/benchmarks/string/convert_fixed_point.cpp | 18 +- cpp/benchmarks/string/count.cpp | 62 + cpp/benchmarks/string/extract.cpp | 71 +- cpp/benchmarks/string/gather.cpp | 59 + cpp/benchmarks/string/join_strings.cpp | 58 + cpp/benchmarks/string/json.cu | 4 +- cpp/benchmarks/string/lengths.cpp | 7 +- cpp/benchmarks/string/like.cpp | 60 +- cpp/benchmarks/string/replace_re.cpp | 84 +- cpp/benchmarks/string/reverse.cpp | 7 +- cpp/benchmarks/string/slice.cpp | 10 +- cpp/benchmarks/string/split.cpp | 86 +- cpp/benchmarks/string/split_re.cpp | 60 + .../synchronization/synchronization.cpp | 4 +- cpp/benchmarks/text/edit_distance.cpp | 58 + cpp/benchmarks/text/hash_ngrams.cpp | 60 + cpp/benchmarks/text/jaccard.cpp | 62 + cpp/benchmarks/text/minhash.cpp | 18 +- cpp/benchmarks/text/normalize.cpp | 69 +- cpp/benchmarks/text/normalize_spaces.cpp | 66 - cpp/benchmarks/text/replace.cpp | 59 +- cpp/benchmarks/text/subword.cpp | 9 +- cpp/benchmarks/text/tokenize.cpp | 108 +- .../type_dispatcher/type_dispatcher.cu | 12 +- cpp/cmake/thirdparty/get_arrow.cmake | 36 +- cpp/cmake/thirdparty/get_cufile.cmake | 4 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 37 + cpp/cmake/thirdparty/get_thrust.cmake | 6 +- .../patches/nvbench_global_setup.diff | 12 +- .../thirdparty/patches/nvbench_override.json | 5 - cpp/doxygen/Doxyfile | 168 +- .../developer_guide/DEVELOPER_GUIDE.md | 25 +- cpp/doxygen/developer_guide/TESTING.md | 66 + cpp/examples/basic/CMakeLists.txt | 4 +- cpp/examples/strings/CMakeLists.txt | 4 +- cpp/examples/strings/common.hpp | 4 +- cpp/examples/strings/custom_prealloc.cu | 2 +- .../cudf/ast/detail/expression_parser.hpp | 27 +- .../ast/detail/expression_transformer.hpp | 64 + cpp/include/cudf/ast/detail/operators.hpp | 47 + cpp/include/cudf/ast/expressions.hpp | 88 +- cpp/include/cudf/column/column.hpp | 87 +- .../cudf/column/column_device_view.cuh | 6 +- cpp/include/cudf/column/column_factories.hpp | 4 +- cpp/include/cudf/column/column_view.hpp | 62 +- cpp/include/cudf/concatenate.hpp | 42 +- cpp/include/cudf/contiguous_split.hpp | 151 +- cpp/include/cudf/copying.hpp | 168 +- cpp/include/cudf/datetime.hpp | 4 +- .../cudf/detail/aggregation/result_cache.hpp | 6 +- ...{concatenate.cuh => concatenate_masks.hpp} | 
27 +- cpp/include/cudf/detail/copy_if.cuh | 26 +- cpp/include/cudf/detail/copy_if_else.cuh | 8 +- cpp/include/cudf/detail/copy_range.cuh | 27 +- cpp/include/cudf/detail/indexalator.cuh | 6 +- cpp/include/cudf/detail/join.hpp | 4 +- cpp/include/cudf/detail/null_mask.cuh | 29 +- cpp/include/cudf/detail/null_mask.hpp | 15 +- cpp/include/cudf/detail/nvtx/nvtx3.hpp | 6 +- cpp/include/cudf/detail/scatter.hpp | 4 +- .../cudf/detail/sizes_to_offsets_iterator.cuh | 8 +- cpp/include/cudf/detail/stream_compaction.hpp | 19 +- cpp/include/cudf/detail/tdigest/tdigest.hpp | 95 +- cpp/include/cudf/detail/transform.hpp | 4 +- cpp/include/cudf/detail/utilities/cuda.cuh | 60 +- .../cudf/detail/utilities/device_atomics.cuh | 124 +- .../detail/utilities/device_operators.cuh | 49 +- .../cudf/detail/utilities/hash_functions.cuh | 381 ---- .../cudf/detail/utilities/int_fastdiv.h | 36 +- .../cudf/detail/utilities/integer_utils.hpp | 8 +- .../detail/utilities/pinned_host_vector.hpp | 6 +- .../cudf/detail/utilities/stacktrace.hpp | 47 + .../detail/utilities/vector_factories.hpp | 4 +- cpp/include/cudf/detail/valid_if.cuh | 6 +- cpp/include/cudf/filling.hpp | 25 +- cpp/include/cudf/fixed_point/fixed_point.hpp | 5 +- cpp/include/cudf/fixed_point/temporary.hpp | 8 +- cpp/include/cudf/groupby.hpp | 11 +- cpp/include/cudf/hashing.hpp | 112 +- .../cudf/hashing/detail/default_hash.cuh | 35 + .../cudf/hashing/detail/hash_functions.cuh | 71 + .../cudf/{ => hashing}/detail/hashing.hpp | 47 +- .../hashing/detail/murmurhash3_x64_128.cuh | 223 +++ .../hashing/detail/murmurhash3_x86_32.cuh | 194 ++ cpp/include/cudf/io/arrow_io_source.hpp | 85 + cpp/include/cudf/io/csv.hpp | 30 +- cpp/include/cudf/io/data_sink.hpp | 10 +- cpp/include/cudf/io/datasource.hpp | 149 +- cpp/include/cudf/io/detail/data_casting.cuh | 6 +- cpp/include/cudf/io/detail/json.hpp | 2 +- cpp/include/cudf/io/detail/orc.hpp | 4 +- cpp/include/cudf/io/detail/parquet.hpp | 12 +- cpp/include/cudf/io/detail/tokenize_json.hpp | 2 + cpp/include/cudf/io/json.hpp | 41 +- cpp/include/cudf/io/orc.hpp | 90 +- cpp/include/cudf/io/orc_metadata.hpp | 4 +- cpp/include/cudf/io/parquet.hpp | 172 +- cpp/include/cudf/io/parquet_metadata.hpp | 231 +++ .../io/text/data_chunk_source_factories.hpp | 4 +- cpp/include/cudf/io/types.hpp | 142 +- cpp/include/cudf/join.hpp | 37 +- cpp/include/cudf/lists/combine.hpp | 4 +- cpp/include/cudf/lists/detail/gather.cuh | 27 +- cpp/include/cudf/lists/detail/scatter.cuh | 8 +- cpp/include/cudf/lists/lists_column_view.hpp | 8 +- cpp/include/cudf/null_mask.hpp | 18 + cpp/include/cudf/reduction.hpp | 2 +- .../cudf/reduction/detail/reduction.hpp | 40 + .../reduction/detail/reduction_operators.cuh | 26 +- cpp/include/cudf/replace.hpp | 25 +- cpp/include/cudf/rolling.hpp | 28 +- .../cudf/rolling/range_window_bounds.hpp | 4 +- cpp/include/cudf/scalar/scalar.hpp | 4 +- .../cudf/scalar/scalar_device_view.cuh | 6 +- cpp/include/cudf/search.hpp | 11 +- cpp/include/cudf/stream_compaction.hpp | 49 +- cpp/include/cudf/strings/combine.hpp | 8 +- .../cudf/strings/convert/convert_datetime.hpp | 2 +- .../cudf/strings/convert/convert_integers.hpp | 4 +- .../cudf/strings/convert/convert_lists.hpp | 4 +- .../cudf/strings/detail/char_tables.hpp | 6 +- .../strings/detail/convert/fixed_point.cuh | 4 +- .../cudf/strings/detail/convert/is_float.cuh | 4 +- .../detail/convert/string_to_float.cuh | 6 +- .../strings/detail/convert/string_to_int.cuh | 4 +- cpp/include/cudf/strings/detail/gather.cuh | 70 +- .../cudf/strings/detail/split_utils.cuh | 62 +- 
.../cudf/strings/detail/strings_children.cuh | 4 +- .../detail/strings_column_factories.cuh | 2 +- cpp/include/cudf/strings/detail/strip.cuh | 4 +- cpp/include/cudf/strings/detail/utf8.hpp | 2 +- cpp/include/cudf/strings/detail/utilities.cuh | 24 +- cpp/include/cudf/strings/find.hpp | 29 +- cpp/include/cudf/strings/repeat_strings.hpp | 7 +- cpp/include/cudf/strings/slice.hpp | 93 - cpp/include/cudf/strings/split/split.hpp | 56 +- cpp/include/cudf/strings/string_view.cuh | 57 +- cpp/include/cudf/strings/string_view.hpp | 57 +- .../cudf/strings/strings_column_view.hpp | 6 +- .../cudf/table/experimental/row_operators.cuh | 34 +- cpp/include/cudf/table/row_operators.cuh | 10 +- .../cudf/tdigest/tdigest_column_view.hpp | 8 +- cpp/include/cudf/types.hpp | 11 +- cpp/include/cudf/utilities/error.hpp | 32 +- cpp/include/cudf/utilities/span.hpp | 4 +- cpp/include/cudf/wrappers/dictionary.hpp | 2 +- cpp/include/cudf_test/base_fixture.hpp | 29 +- cpp/include/cudf_test/column_utilities.hpp | 4 +- cpp/include/cudf_test/column_wrapper.hpp | 47 +- cpp/include/cudf_test/cudf_gtest.hpp | 2 +- cpp/include/cudf_test/cxxopts.hpp | 262 +-- cpp/include/cudf_test/file_utilities.hpp | 6 +- .../stream_checking_resource_adaptor.hpp | 8 + cpp/include/cudf_test/tdigest_utilities.cuh | 4 +- cpp/include/doxygen_groups.h | 5 +- cpp/include/nvtext/bpe_tokenize.hpp | 15 +- cpp/include/nvtext/detail/generate_ngrams.hpp | 37 + cpp/include/nvtext/generate_ngrams.hpp | 35 +- cpp/include/nvtext/jaccard.hpp | 79 + cpp/include/nvtext/minhash.hpp | 91 +- cpp/include/nvtext/subword_tokenize.hpp | 9 +- cpp/libcudf_kafka/CMakeLists.txt | 4 +- .../include/cudf_kafka/kafka_callback.hpp | 4 +- cpp/src/ast/expression_parser.cpp | 12 +- cpp/src/ast/expressions.cpp | 27 +- cpp/src/binaryop/binaryop.cpp | 5 +- cpp/src/binaryop/compiled/binary_ops.cu | 11 +- cpp/src/binaryop/compiled/binary_ops.cuh | 4 +- cpp/src/bitmask/null_mask.cu | 51 +- cpp/src/column/column.cu | 27 +- cpp/src/column/column_factories.cpp | 56 +- cpp/src/column/column_factories.cu | 2 +- cpp/src/column/column_view.cpp | 18 +- cpp/src/copying/concatenate.cu | 110 +- cpp/src/copying/contiguous_split.cu | 1527 +++++++++++---- cpp/src/copying/copy.cpp | 10 +- cpp/src/copying/copy.cu | 18 +- cpp/src/copying/copy_range.cu | 12 +- cpp/src/copying/gather.cu | 13 +- cpp/src/copying/get_element.cu | 3 +- cpp/src/copying/purge_nonempty_nulls.cu | 14 +- cpp/src/copying/reverse.cu | 14 +- cpp/src/copying/sample.cu | 5 +- cpp/src/copying/scatter.cu | 55 +- cpp/src/copying/shift.cu | 3 +- cpp/src/copying/slice.cu | 28 +- cpp/src/copying/split.cpp | 24 +- cpp/src/datetime/datetime_ops.cu | 4 +- cpp/src/datetime/timezone.cpp | 8 +- cpp/src/dictionary/add_keys.cu | 2 +- cpp/src/dictionary/detail/concatenate.cu | 10 +- cpp/src/dictionary/dictionary_factories.cu | 5 +- cpp/src/dictionary/remove_keys.cu | 5 +- cpp/src/filling/calendrical_month_sequence.cu | 5 +- cpp/src/filling/fill.cu | 9 +- cpp/src/filling/repeat.cu | 24 +- cpp/src/filling/sequence.cu | 6 +- cpp/src/groupby/groupby.cu | 17 +- cpp/src/groupby/hash/groupby.cu | 34 +- cpp/src/groupby/hash/multi_pass_kernels.cuh | 8 +- cpp/src/groupby/sort/aggregate.cpp | 10 +- cpp/src/groupby/sort/group_collect.cu | 4 +- cpp/src/groupby/sort/group_merge_lists.cu | 6 +- cpp/src/groupby/sort/group_scan_util.cuh | 3 +- .../sort/group_single_pass_reduction_util.cuh | 20 +- cpp/src/groupby/sort/group_std.cu | 38 +- cpp/src/groupby/sort/scan.cpp | 2 +- cpp/src/groupby/sort/sort_helper.cu | 79 +- cpp/src/hash/concurrent_unordered_map.cuh | 72 +- 
cpp/src/hash/hash_allocator.cuh | 124 +- cpp/src/hash/hashing.cu | 45 +- cpp/src/hash/helper_functions.cuh | 46 +- cpp/src/hash/managed.cuh | 88 +- cpp/src/hash/md5_hash.cu | 118 +- cpp/src/hash/murmurhash3_x64_128.cu | 150 ++ .../{murmur_hash.cu => murmurhash3_x86_32.cu} | 29 +- ...ur_hash.cu => spark_murmurhash3_x86_32.cu} | 87 +- cpp/src/hash/unordered_multiset.cuh | 64 +- cpp/src/hash/xxhash_64.cu | 337 ++++ cpp/src/interop/detail/arrow_allocator.cpp | 45 +- cpp/src/interop/detail/arrow_allocator.hpp | 6 +- cpp/src/interop/dlpack.cpp | 12 +- cpp/src/interop/from_arrow.cu | 29 +- cpp/src/interop/to_arrow.cu | 4 +- cpp/src/io/avro/avro.cpp | 17 +- cpp/src/io/avro/avro.hpp | 16 +- cpp/src/io/avro/avro_gpu.cu | 16 +- cpp/src/io/avro/reader_impl.cu | 39 +- cpp/src/io/comp/brotli_dict.cpp | 4 +- cpp/src/io/comp/brotli_dict.hpp | 4 +- cpp/src/io/comp/cpu_unbz2.cpp | 24 +- cpp/src/io/comp/debrotli.cu | 72 +- cpp/src/io/comp/gpuinflate.cu | 58 +- cpp/src/io/comp/gpuinflate.hpp | 30 +- cpp/src/io/comp/nvcomp_adapter.cpp | 4 +- cpp/src/io/comp/snap.cu | 18 +- cpp/src/io/comp/statistics.cu | 62 + cpp/src/io/comp/unbz2.hpp | 4 +- cpp/src/io/comp/uncomp.cpp | 65 +- cpp/src/io/comp/unsnap.cu | 14 +- cpp/src/io/csv/csv_gpu.cu | 47 +- cpp/src/io/csv/csv_gpu.hpp | 6 +- cpp/src/io/csv/datetime.cuh | 10 +- cpp/src/io/csv/durations.cu | 8 +- cpp/src/io/csv/reader_impl.cu | 58 +- cpp/src/io/csv/writer_impl.cu | 10 +- cpp/src/io/fst/agent_dfa.cuh | 58 +- cpp/src/io/fst/in_reg_array.cuh | 6 +- cpp/src/io/fst/logical_stack.cuh | 53 +- cpp/src/io/fst/lookup_tables.cuh | 591 ++++-- cpp/src/io/functions.cpp | 54 +- .../{experimental => }/byte_range_info.cu | 6 +- cpp/src/io/json/json_column.cu | 97 +- cpp/src/io/json/json_tree.cu | 113 +- cpp/src/io/json/{ => legacy}/json_gpu.cu | 49 +- cpp/src/io/json/{ => legacy}/json_gpu.hpp | 12 +- cpp/src/io/json/legacy/read_json.hpp | 33 + cpp/src/io/json/{ => legacy}/reader_impl.cu | 54 +- cpp/src/io/json/nested_json.hpp | 37 +- cpp/src/io/json/nested_json_gpu.cu | 1157 +++++++---- .../read_json.cpp => read_json.cu} | 50 +- .../io/json/{experimental => }/read_json.hpp | 6 +- cpp/src/io/json/write_json.cu | 117 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 26 +- cpp/src/io/orc/aggregate_orc_metadata.hpp | 13 +- cpp/src/io/orc/dict_enc.cu | 630 ++---- cpp/src/io/orc/orc.cpp | 20 +- cpp/src/io/orc/orc.hpp | 10 - cpp/src/io/orc/orc_field_reader.hpp | 8 +- cpp/src/io/orc/orc_field_writer.hpp | 18 +- cpp/src/io/orc/orc_gpu.hpp | 185 +- cpp/src/io/orc/reader_impl.cu | 1497 +++++++------- cpp/src/io/orc/reader_impl.hpp | 182 +- cpp/src/io/orc/stats_enc.cu | 34 +- cpp/src/io/orc/stripe_data.cu | 24 +- cpp/src/io/orc/stripe_enc.cu | 86 +- cpp/src/io/orc/stripe_init.cu | 18 +- cpp/src/io/orc/writer_impl.cu | 642 +++--- cpp/src/io/orc/writer_impl.hpp | 26 +- cpp/src/io/parquet/chunk_dict.cu | 16 +- .../io/parquet/compact_protocol_reader.cpp | 23 +- .../io/parquet/compact_protocol_reader.hpp | 19 +- .../io/parquet/compact_protocol_writer.cpp | 80 +- .../io/parquet/compact_protocol_writer.hpp | 49 +- cpp/src/io/parquet/decode_preprocess.cu | 417 ++++ cpp/src/io/parquet/delta_binary.cuh | 294 +++ cpp/src/io/parquet/page_data.cu | 1725 ++--------------- cpp/src/io/parquet/page_decode.cuh | 1363 +++++++++++++ cpp/src/io/parquet/page_delta_decode.cu | 176 ++ cpp/src/io/parquet/page_enc.cu | 365 ++-- cpp/src/io/parquet/page_hdr.cu | 64 +- cpp/src/io/parquet/page_string_decode.cu | 797 ++++++++ cpp/src/io/parquet/page_string_utils.cuh | 110 ++ cpp/src/io/parquet/parquet.hpp | 19 +- 
cpp/src/io/parquet/parquet_common.hpp | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 201 +- cpp/src/io/parquet/predicate_pushdown.cpp | 530 +++++ cpp/src/io/parquet/reader.cpp | 3 +- cpp/src/io/parquet/reader_impl.cpp | 284 ++- cpp/src/io/parquet/reader_impl.hpp | 49 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 62 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 105 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 302 ++- cpp/src/io/parquet/rle_stream.cuh | 365 ++++ cpp/src/io/parquet/writer_impl.cu | 228 ++- cpp/src/io/parquet/writer_impl.hpp | 11 +- cpp/src/io/statistics/byte_array_view.cuh | 10 +- cpp/src/io/statistics/column_statistics.cuh | 36 +- .../io/statistics/orc_column_statistics.cu | 8 +- .../statistics/parquet_column_statistics.cu | 8 +- cpp/src/io/statistics/statistics.cuh | 8 +- .../statistics_type_identification.cuh | 6 +- .../io/statistics/typed_statistics_chunk.cuh | 12 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 18 +- cpp/src/io/text/bgzip_utils.cpp | 4 +- .../io/text/data_chunk_source_factories.cpp | 10 +- cpp/src/io/text/multibyte_split.cu | 49 +- cpp/src/io/utilities/arrow_io_source.cpp | 85 + cpp/src/io/utilities/block_utils.cuh | 16 +- cpp/src/io/utilities/column_buffer.cpp | 190 +- cpp/src/io/utilities/column_buffer.hpp | 184 +- cpp/src/io/utilities/column_utils.cuh | 4 +- cpp/src/io/utilities/data_sink.cpp | 4 +- cpp/src/io/utilities/datasource.cpp | 87 +- cpp/src/io/utilities/file_io_utilities.cpp | 10 +- cpp/src/io/utilities/hostdevice_span.hpp | 22 +- cpp/src/io/utilities/hostdevice_vector.hpp | 41 +- cpp/src/io/utilities/output_builder.cuh | 18 +- cpp/src/io/utilities/parsing_utils.cu | 41 +- cpp/src/io/utilities/row_selection.cpp | 15 +- cpp/src/io/utilities/row_selection.hpp | 9 +- cpp/src/io/utilities/thread_pool.hpp | 20 +- cpp/src/io/utilities/trie.cu | 12 +- cpp/src/io/utilities/trie.cuh | 2 +- cpp/src/jit/cache.cpp | 4 +- cpp/src/jit/parser.cpp | 40 +- cpp/src/jit/parser.hpp | 42 +- cpp/src/jit/util.cpp | 8 +- cpp/src/jit/util.hpp | 4 +- cpp/src/join/conditional_join_kernels.cuh | 48 +- cpp/src/join/hash_join.cu | 13 +- cpp/src/join/join_common_utils.cuh | 20 +- cpp/src/join/join_common_utils.hpp | 11 +- cpp/src/join/mixed_join_common_utils.cuh | 3 +- cpp/src/join/mixed_join_size_kernel.cuh | 5 +- cpp/src/join/mixed_join_size_kernels_semi.cu | 5 +- cpp/src/labeling/label_bins.cu | 12 +- .../combine/concatenate_list_elements.cu | 53 +- cpp/src/lists/combine/concatenate_rows.cu | 19 +- cpp/src/lists/contains.cu | 337 +--- cpp/src/lists/copying/concatenate.cu | 16 +- cpp/src/lists/copying/copying.cu | 7 +- cpp/src/lists/copying/scatter_helper.cu | 14 +- cpp/src/lists/interleave_columns.cu | 36 +- cpp/src/lists/lists_column_factories.cu | 18 +- cpp/src/lists/reverse.cu | 2 +- cpp/src/lists/sequences.cu | 18 +- .../stream_compaction/apply_boolean_mask.cu | 10 +- cpp/src/lists/utilities.cu | 10 +- cpp/src/merge/merge.cu | 19 +- cpp/src/partitioning/partitioning.cu | 88 +- cpp/src/quantiles/tdigest/tdigest.cu | 18 +- .../quantiles/tdigest/tdigest_aggregation.cu | 46 +- cpp/src/reductions/all.cu | 16 +- cpp/src/reductions/any.cu | 16 +- cpp/src/reductions/minmax.cu | 6 +- .../reductions/nested_type_minmax_util.cuh | 182 ++ cpp/src/reductions/scan/scan.cuh | 12 +- cpp/src/reductions/scan/scan_exclusive.cu | 24 +- cpp/src/reductions/scan/scan_inclusive.cu | 56 +- cpp/src/reductions/segmented/simple.cuh | 2 +- cpp/src/reductions/simple.cuh | 14 +- cpp/src/reductions/struct_minmax_util.cuh | 155 -- cpp/src/replace/clamp.cu | 6 +- cpp/src/replace/nans.cu | 15 +- 
cpp/src/replace/nulls.cu | 27 +- cpp/src/replace/replace.cu | 103 +- cpp/src/reshape/interleave_columns.cu | 2 +- cpp/src/reshape/tile.cu | 6 +- cpp/src/rolling/detail/nth_element.cuh | 4 +- .../detail/optimized_unbounded_window.cpp | 161 ++ .../detail/optimized_unbounded_window.hpp | 56 + .../rolling/detail/range_comparator_utils.cuh | 143 ++ .../rolling/detail/range_window_bounds.hpp | 8 +- cpp/src/rolling/detail/rolling.cuh | 8 +- .../rolling/detail/rolling_collect_list.cu | 4 +- cpp/src/rolling/detail/rolling_jit.hpp | 10 +- cpp/src/rolling/grouped_rolling.cu | 132 +- cpp/src/rolling/jit/kernel.cu | 4 +- cpp/src/rolling/jit/operation.hpp | 6 +- cpp/src/rolling/range_window_bounds.cpp | 14 +- cpp/src/round/round.cu | 4 +- cpp/src/scalar/scalar.cpp | 6 +- cpp/src/search/contains_column.cu | 5 +- cpp/src/search/contains_scalar.cu | 4 +- cpp/src/search/contains_table.cu | 599 +++--- cpp/src/search/search_ordered.cu | 10 +- cpp/src/sort/segmented_sort_impl.cuh | 4 +- cpp/src/stream_compaction/distinct_count.cu | 48 +- cpp/src/stream_compaction/stable_distinct.cu | 37 +- .../stream_compaction_common.cuh | 2 +- .../stream_compaction_common.hpp | 1 - cpp/src/strings/attributes.cu | 10 +- cpp/src/strings/capitalize.cu | 12 +- cpp/src/strings/case.cu | 12 +- cpp/src/strings/char_types/char_cases.h | 4 +- cpp/src/strings/char_types/char_flags.h | 4 +- cpp/src/strings/char_types/char_types.cu | 93 +- cpp/src/strings/combine/concatenate.cu | 6 +- cpp/src/strings/combine/join.cu | 189 +- cpp/src/strings/combine/join_list_elements.cu | 6 +- cpp/src/strings/contains.cu | 7 +- cpp/src/strings/convert/convert_booleans.cu | 2 +- cpp/src/strings/convert/convert_datetime.cu | 4 +- cpp/src/strings/convert/convert_durations.cu | 22 +- .../strings/convert/convert_fixed_point.cu | 2 +- cpp/src/strings/convert/convert_floats.cu | 8 +- cpp/src/strings/convert/convert_hex.cu | 8 +- cpp/src/strings/convert/convert_integers.cu | 2 +- cpp/src/strings/convert/convert_ipv4.cu | 4 +- cpp/src/strings/convert/convert_lists.cu | 4 +- cpp/src/strings/convert/convert_urls.cu | 87 +- cpp/src/strings/copying/concatenate.cu | 16 +- cpp/src/strings/copying/shift.cu | 88 +- cpp/src/strings/count_matches.cu | 12 +- cpp/src/strings/extract/extract.cu | 25 +- cpp/src/strings/extract/extract_all.cu | 46 +- cpp/src/strings/json/json_path.cu | 34 +- cpp/src/strings/like.cu | 27 +- cpp/src/strings/padding.cu | 4 +- cpp/src/strings/regex/regcomp.cpp | 18 +- cpp/src/strings/regex/regcomp.h | 6 +- cpp/src/strings/regex/regex.cuh | 79 +- cpp/src/strings/regex/regex.inl | 58 +- cpp/src/strings/regex/regex_program_impl.h | 3 +- cpp/src/strings/regex/utilities.cuh | 4 +- cpp/src/strings/repeat_strings.cu | 15 +- cpp/src/strings/replace/backref_re.cuh | 51 +- cpp/src/strings/replace/multi_re.cu | 84 +- cpp/src/strings/replace/replace.cu | 2 +- cpp/src/strings/replace/replace_re.cu | 66 +- cpp/src/strings/reverse.cu | 6 +- cpp/src/strings/search/find.cu | 180 +- cpp/src/strings/search/find_multiple.cu | 6 +- cpp/src/strings/search/findall.cu | 23 +- cpp/src/strings/slice.cu | 157 -- cpp/src/strings/split/partition.cu | 2 +- cpp/src/strings/split/split.cu | 33 +- cpp/src/strings/split/split.cuh | 7 +- cpp/src/strings/split/split_re.cu | 41 +- cpp/src/strings/split/split_record.cu | 96 +- cpp/src/strings/strings_column_factories.cu | 10 +- cpp/src/strings/strings_column_view.cpp | 4 +- cpp/src/strings/utilities.cu | 14 +- cpp/src/structs/copying/concatenate.cu | 15 +- cpp/src/structs/utilities.cpp | 12 +- cpp/src/table/row_operators.cu | 420 ++-- 
cpp/src/text/edit_distance.cu | 194 +- cpp/src/text/generate_ngrams.cu | 113 +- cpp/src/text/jaccard.cu | 307 +++ cpp/src/text/minhash.cu | 265 ++- cpp/src/text/ngrams_tokenize.cu | 2 +- cpp/src/text/normalize.cu | 43 +- cpp/src/text/replace.cu | 2 +- cpp/src/text/stemmer.cu | 26 +- cpp/src/text/subword/bpe_tokenizer.cu | 67 +- cpp/src/text/subword/bpe_tokenizer.cuh | 78 +- cpp/src/text/subword/data_normalizer.cu | 51 +- .../text/subword/detail/codepoint_metadata.ah | 4 +- .../text/subword/detail/data_normalizer.hpp | 10 +- .../text/subword/detail/tokenizer_utils.cuh | 9 +- .../subword/detail/wordpiece_tokenizer.hpp | 10 +- cpp/src/text/subword/load_hash_file.cu | 14 +- cpp/src/text/subword/load_merges_file.cu | 51 +- cpp/src/text/subword/subword_tokenize.cu | 69 +- cpp/src/text/subword/wordpiece_tokenizer.cu | 47 +- cpp/src/text/tokenize.cu | 57 +- cpp/src/text/utilities/tokenize_ops.cuh | 55 +- cpp/src/transform/compute_column.cu | 8 +- cpp/src/transform/jit/kernel.cu | 15 +- cpp/src/transform/one_hot_encode.cu | 8 +- cpp/src/transform/row_bit_count.cu | 12 +- cpp/src/transform/transform.cpp | 2 +- cpp/src/transpose/transpose.cu | 8 +- cpp/src/utilities/stacktrace.cpp | 88 + cpp/tests/CMakeLists.txt | 63 +- cpp/tests/ast/transform_tests.cpp | 28 + cpp/tests/binaryop/binop-generic-ptx-test.cpp | 6 +- cpp/tests/bitmask/bitmask_tests.cpp | 12 +- cpp/tests/column/column_test.cpp | 86 +- .../column/column_view_device_span_test.cpp | 6 +- cpp/tests/column/factories_test.cpp | 22 +- cpp/tests/copying/concatenate_tests.cpp | 70 +- .../copying/copy_if_else_nested_tests.cpp | 2 +- cpp/tests/copying/copy_tests.cpp | 46 +- cpp/tests/copying/gather_str_tests.cpp | 18 +- cpp/tests/copying/gather_struct_tests.cpp | 4 +- cpp/tests/copying/get_value_tests.cpp | 20 +- .../copying/purge_nonempty_nulls_tests.cpp | 45 +- .../copying/scatter_list_scalar_tests.cpp | 8 +- cpp/tests/copying/scatter_tests.cpp | 10 +- cpp/tests/copying/shift_tests.cpp | 38 +- cpp/tests/copying/split_tests.cpp | 965 ++++++--- cpp/tests/copying/utility_tests.cpp | 20 +- cpp/tests/datetime/datetime_ops_test.cpp | 9 +- .../device_atomics/device_atomics_test.cu | 91 - cpp/tests/dictionary/decode_test.cpp | 2 +- cpp/tests/dictionary/remove_keys_test.cpp | 2 +- cpp/tests/dictionary/set_keys_test.cpp | 2 +- cpp/tests/filling/repeat_tests.cpp | 34 + cpp/tests/groupby/collect_list_tests.cpp | 10 +- cpp/tests/groupby/groupby_test_util.cpp | 2 +- cpp/tests/groupby/groupby_test_util.hpp | 1 + cpp/tests/groupby/max_tests.cpp | 71 + cpp/tests/groupby/min_tests.cpp | 71 + cpp/tests/groupby/tdigest_tests.cu | 8 +- cpp/tests/groupby/var_tests.cpp | 30 + cpp/tests/hash_map/map_test.cu | 5 +- cpp/tests/hashing/hash_test.cpp | 1109 ----------- cpp/tests/hashing/md5_test.cpp | 290 +++ .../hashing/murmurhash3_x64_128_test.cpp | 113 ++ cpp/tests/hashing/murmurhash3_x86_32_test.cpp | 405 ++++ .../hashing/spark_murmurhash3_x86_32_test.cpp | 576 ++++++ cpp/tests/hashing/xxhash_64_test.cpp | 177 ++ cpp/tests/interop/arrow_utils.hpp | 4 +- cpp/tests/interop/dlpack_test.cpp | 7 +- cpp/tests/interop/from_arrow_test.cpp | 9 +- cpp/tests/interop/to_arrow_test.cpp | 10 +- cpp/tests/io/arrow_io_source_test.cpp | 20 +- cpp/tests/io/comp/decomp_test.cpp | 22 +- cpp/tests/io/csv_test.cpp | 201 +- cpp/tests/io/fst/fst_test.cu | 21 +- cpp/tests/io/fst/logical_stack_test.cu | 4 +- cpp/tests/io/json_chunked_reader.cpp | 4 +- cpp/tests/io/json_test.cpp | 139 +- cpp/tests/io/json_tree.cpp | 21 +- cpp/tests/io/json_type_cast_test.cu | 60 +- cpp/tests/io/json_writer.cpp | 115 
++ cpp/tests/io/nested_json_test.cpp | 302 ++- cpp/tests/io/orc_test.cpp | 218 ++- cpp/tests/io/parquet_test.cpp | 1647 ++++++++++++++-- cpp/tests/io/row_selection_test.cpp | 2 +- cpp/tests/io/text/data_chunk_source_test.cpp | 6 +- cpp/tests/iterator/iterator_tests.cuh | 4 +- .../optional_iterator_test_numeric.cu | 4 +- .../iterator/pair_iterator_test_numeric.cu | 6 +- .../sizes_to_offsets_iterator_test.cu | 4 +- .../iterator/value_iterator_test_transform.cu | 6 +- cpp/tests/join/conditional_join_tests.cu | 10 +- cpp/tests/join/join_tests.cpp | 19 +- cpp/tests/join/mixed_join_tests.cu | 10 +- cpp/tests/join/semi_anti_join_tests.cpp | 7 +- .../concatenate_list_elements_tests.cpp | 315 +++ cpp/tests/lists/contains_tests.cpp | 4 +- cpp/tests/lists/extract_tests.cpp | 6 +- .../partitioning/hash_partition_test.cpp | 2 +- .../quantiles/percentile_approx_test.cpp | 4 +- cpp/tests/reductions/reduction_tests.cpp | 198 +- cpp/tests/reductions/scan_tests.cpp | 1 + cpp/tests/reductions/tdigest_tests.cu | 6 +- cpp/tests/replace/replace_nulls_tests.cpp | 23 +- cpp/tests/replace/replace_tests.cpp | 38 +- .../reshape/interleave_columns_tests.cpp | 2 +- .../rolling/grouped_rolling_range_test.cpp | 384 +++- cpp/tests/rolling/grouped_rolling_test.cpp | 73 +- cpp/tests/rolling/range_comparator_test.cu | 147 ++ cpp/tests/rolling/rolling_test.cpp | 8 +- cpp/tests/scalar/scalar_device_view_test.cu | 2 +- cpp/tests/search/search_test.cpp | 140 +- cpp/tests/sort/sort_nested_types_tests.cpp | 20 + .../distinct_count_tests.cpp | 10 + .../stream_compaction/distinct_tests.cpp | 14 +- .../stable_distinct_tests.cpp | 1354 +++++++++++++ cpp/tests/streams/concatenate_test.cpp | 51 + cpp/tests/streams/copying_test.cpp | 339 ++++ cpp/tests/streams/filling_test.cpp | 76 + cpp/tests/streams/groupby_test.cpp | 67 + cpp/tests/streams/hash_test.cpp | 54 + cpp/tests/streams/replace_test.cpp | 109 ++ cpp/tests/streams/search_test.cpp | 69 + cpp/tests/strings/array_tests.cpp | 32 +- cpp/tests/strings/attrs_tests.cpp | 8 +- cpp/tests/strings/booleans_tests.cpp | 13 +- cpp/tests/strings/case_tests.cpp | 16 +- cpp/tests/strings/chars_types_tests.cpp | 8 +- .../strings/combine/concatenate_tests.cpp | 24 +- .../strings/combine/join_strings_tests.cpp | 19 +- cpp/tests/strings/concatenate_tests.cpp | 14 +- cpp/tests/strings/contains_tests.cpp | 16 +- cpp/tests/strings/datetime_tests.cpp | 12 +- cpp/tests/strings/durations_tests.cpp | 10 +- cpp/tests/strings/extract_tests.cpp | 24 +- cpp/tests/strings/factories_test.cu | 24 +- cpp/tests/strings/fill_tests.cpp | 11 +- cpp/tests/strings/find_multiple_tests.cpp | 16 +- cpp/tests/strings/find_tests.cpp | 85 +- cpp/tests/strings/findall_tests.cpp | 4 +- cpp/tests/strings/floats_tests.cpp | 10 +- cpp/tests/strings/integers_tests.cpp | 19 +- cpp/tests/strings/ipv4_tests.cpp | 9 +- cpp/tests/strings/pad_tests.cpp | 26 +- cpp/tests/strings/repeat_strings_tests.cpp | 2 +- cpp/tests/strings/replace_regex_tests.cpp | 37 +- cpp/tests/strings/replace_tests.cpp | 30 +- cpp/tests/strings/reverse_tests.cpp | 4 +- cpp/tests/strings/slice_tests.cpp | 272 +-- cpp/tests/strings/split_tests.cpp | 76 +- cpp/tests/strings/strip_tests.cpp | 26 +- cpp/tests/strings/translate_tests.cpp | 12 +- cpp/tests/strings/urls_tests.cpp | 26 +- cpp/tests/structs/structs_column_tests.cpp | 4 +- cpp/tests/structs/utilities_tests.cpp | 44 - .../table/experimental_row_operator_tests.cu | 191 +- .../table/row_operator_tests_utilities.cu | 214 ++ .../table/row_operator_tests_utilities.hpp | 45 + 
cpp/tests/text/edit_distance_tests.cpp | 6 +- cpp/tests/text/jaccard_tests.cpp | 80 + cpp/tests/text/minhash_tests.cpp | 71 +- cpp/tests/text/ngrams_tests.cpp | 41 +- cpp/tests/text/ngrams_tokenize_tests.cpp | 2 +- cpp/tests/text/normalize_tests.cpp | 6 +- cpp/tests/text/replace_tests.cpp | 4 +- cpp/tests/text/stemmer_tests.cpp | 6 +- cpp/tests/text/subword_tests.cpp | 142 +- cpp/tests/text/tokenize_tests.cpp | 6 +- .../transform/integration/assert_unary.h | 6 +- .../integration/unary_transform_test.cpp | 16 +- cpp/tests/transform/mask_to_bools_test.cpp | 2 +- cpp/tests/transform/row_bit_count_test.cu | 47 +- cpp/tests/transpose/transpose_test.cpp | 6 +- cpp/tests/types/traits_test.cpp | 2 +- cpp/tests/unary/cast_tests.cpp | 61 +- cpp/tests/utilities/column_utilities.cu | 117 +- cpp/tests/utilities/identify_stream_usage.cpp | 109 +- cpp/tests/utilities/tdigest_utilities.cu | 6 +- .../column_utilities_tests.cpp | 28 +- cpp/tests/utilities_tests/span_tests.cu | 42 +- dependencies.yaml | 189 +- docs/cudf/Makefile | 2 +- ...lass_without_autosummary.rst => class.rst} | 3 + .../autosummary/class_with_autosummary.rst | 33 - docs/cudf/source/api_docs/dataframe.rst | 13 +- .../cudf/source/api_docs/extension_dtypes.rst | 170 ++ .../source/api_docs/general_functions.rst | 14 +- docs/cudf/source/api_docs/index.rst | 1 + docs/cudf/source/api_docs/index_objects.rst | 32 +- docs/cudf/source/api_docs/io.rst | 4 +- docs/cudf/source/api_docs/list_handling.rst | 6 + docs/cudf/source/api_docs/options.rst | 19 +- docs/cudf/source/api_docs/series.rst | 22 +- docs/cudf/source/api_docs/string_handling.rst | 6 + docs/cudf/source/api_docs/struct_handling.rst | 6 + .../cudf/source/api_docs/subword_tokenize.rst | 2 +- docs/cudf/source/conf.py | 52 +- .../developer_guide/contributing_guide.md | 7 +- .../source/developer_guide/documentation.md | 29 + docs/cudf/source/developer_guide/index.md | 1 + .../source/developer_guide/library_design.md | 26 +- docs/cudf/source/developer_guide/pylibcudf.md | 155 ++ docs/cudf/source/user_guide/10min.ipynb | 74 +- .../user_guide/cudf.CategoricalDtype.rst | 19 - .../user_guide/cudf.Decimal128Dtype.rst | 20 - .../source/user_guide/cudf.Decimal32Dtype.rst | 20 - .../source/user_guide/cudf.Decimal64Dtype.rst | 20 - .../cudf/source/user_guide/cudf.ListDtype.rst | 19 - .../source/user_guide/cudf.StructDtype.rst | 18 - .../cudf/source/user_guide/cupy-interop.ipynb | 14 +- docs/cudf/source/user_guide/data-types.md | 41 +- docs/cudf/source/user_guide/groupby.md | 5 +- .../source/user_guide/guide-to-udfs.ipynb | 4 +- docs/cudf/source/user_guide/io/io.md | 19 +- .../source/user_guide/pandas-comparison.md | 66 +- docs/dask_cudf/Makefile | 3 +- docs/dask_cudf/source/conf.py | 4 +- fetch_rapids.cmake | 2 +- java/ci/Dockerfile.centos7 | 2 +- java/ci/README.md | 4 +- java/pom.xml | 2 +- .../main/java/ai/rapids/cudf/ChunkedPack.java | 103 + .../java/ai/rapids/cudf/ColumnVector.java | 53 +- .../main/java/ai/rapids/cudf/ColumnView.java | 187 +- .../ai/rapids/cudf/ColumnWriterOptions.java | 30 + .../java/ai/rapids/cudf/ContiguousTable.java | 27 +- .../java/ai/rapids/cudf/CudaException.java | 26 +- .../ai/rapids/cudf/CudaFatalException.java | 12 +- .../cudf/CudfColumnSizeOverflowException.java | 34 + .../java/ai/rapids/cudf/CudfException.java | 16 +- .../cudf/DefaultHostMemoryAllocator.java | 36 + .../java/ai/rapids/cudf/HostColumnVector.java | 47 +- .../ai/rapids/cudf/HostMemoryAllocator.java | 39 + .../ai/rapids/cudf/HostMemoryReservation.java | 32 + .../ai/rapids/cudf/JCudfSerialization.java | 23 +- 
.../java/ai/rapids/cudf/MemoryCleaner.java | 41 +- .../ai/rapids/cudf/PackedColumnMetadata.java | 74 + .../java/ai/rapids/cudf/PinnedMemoryPool.java | 164 +- java/src/main/java/ai/rapids/cudf/Scalar.java | 21 +- java/src/main/java/ai/rapids/cudf/Schema.java | 13 +- java/src/main/java/ai/rapids/cudf/Table.java | 481 +++-- .../main/java/ai/rapids/cudf/TableDebug.java | 280 +++ .../main/java/ai/rapids/cudf/TableWriter.java | 76 +- .../ai/rapids/cudf/ast/UnaryOperator.java | 53 +- .../cudf/nvcomp/BatchedLZ4Compressor.java | 10 +- java/src/main/native/CMakeLists.txt | 6 +- java/src/main/native/include/jni_utils.hpp | 127 +- java/src/main/native/src/ChunkedPackJni.cpp | 75 + java/src/main/native/src/ColumnViewJni.cpp | 94 +- java/src/main/native/src/ColumnViewJni.cu | 8 +- java/src/main/native/src/ColumnViewJni.hpp | 4 +- .../main/native/src/CompiledExpression.cpp | 51 +- .../main/native/src/ContiguousTableJni.cpp | 30 +- java/src/main/native/src/CudfJni.cpp | 25 +- .../native/src/PackedColumnMetadataJni.cpp | 41 + java/src/main/native/src/RmmJni.cpp | 7 +- java/src/main/native/src/TableJni.cpp | 283 +-- java/src/main/native/src/cudf_jni_apis.hpp | 3 +- .../main/native/src/jni_writer_data_sink.hpp | 29 +- java/src/main/native/src/row_conversion.cu | 34 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 151 +- .../cudf/ColumnViewNonEmptyNullsTest.java | 6 +- .../test/java/ai/rapids/cudf/CuFileTest.java | 15 +- .../java/ai/rapids/cudf/CudaFatalTest.java | 6 +- .../test/java/ai/rapids/cudf/CudaTest.java | 6 +- .../java/ai/rapids/cudf/GatherMapTest.java | 8 +- .../java/ai/rapids/cudf/LargeTableTest.java | 66 + .../java/ai/rapids/cudf/MemoryBufferTest.java | 42 +- .../java/ai/rapids/cudf/ReductionTest.java | 11 + .../src/test/java/ai/rapids/cudf/RmmTest.java | 13 +- .../test/java/ai/rapids/cudf/TableTest.java | 291 ++- .../rapids/cudf/UnsafeMemoryAccessorTest.java | 4 +- .../cudf/ast/CompiledExpressionTest.java | 20 +- .../ai/rapids/cudf/nvcomp/NvcompTest.java | 8 +- pyproject.toml | 13 + python/cudf/CMakeLists.txt | 4 +- python/cudf/benchmarks/API/bench_dataframe.py | 34 + .../internal/bench_dataframe_internal.py | 6 +- .../cudf/cmake/Modules/ProtobufHelpers.cmake | 3 +- python/cudf/cmake/Modules/WheelHelpers.cmake | 4 +- python/cudf/cudf/__init__.py | 44 +- python/cudf/cudf/_fuzz_testing/utils.py | 12 +- python/cudf/cudf/_lib/CMakeLists.txt | 7 +- python/cudf/cudf/_lib/__init__.py | 1 + python/cudf/cudf/_lib/column.pxd | 6 +- python/cudf/cudf/_lib/column.pyi | 11 +- python/cudf/cudf/_lib/column.pyx | 180 +- python/cudf/cudf/_lib/concat.pyx | 7 +- python/cudf/cudf/_lib/copying.pyx | 37 +- python/cudf/cudf/_lib/cpp/CMakeLists.txt | 23 + python/cudf/cudf/_lib/cpp/column/column.pxd | 8 +- .../cudf/cudf/_lib/cpp/column/column_view.pxd | 3 +- python/cudf/cudf/_lib/cpp/copying.pxd | 6 +- python/cudf/cudf/_lib/cpp/copying.pyx | 0 python/cudf/cudf/_lib/cpp/expressions.pxd | 9 +- .../cudf/cudf/_lib/cpp/io/arrow_io_source.pxd | 15 + python/cudf/cudf/_lib/cpp/io/data_sink.pxd | 8 + python/cudf/cudf/_lib/cpp/io/datasource.pxd | 8 + python/cudf/cudf/_lib/cpp/io/orc.pxd | 16 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 24 +- python/cudf/cudf/_lib/cpp/io/types.pxd | 29 +- python/cudf/cudf/_lib/cpp/libcpp/memory.pxd | 12 + python/cudf/cudf/_lib/cpp/null_mask.pxd | 10 +- .../cudf/_lib/cpp/nvtext/generate_ngrams.pxd | 7 +- python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd | 16 + python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 8 +- .../cudf/_lib/cpp/nvtext/subword_tokenize.pxd | 8 +- python/cudf/cudf/_lib/cpp/sorting.pxd | 
29 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 16 +- python/cudf/cudf/_lib/cpp/table/table.pxd | 3 +- python/cudf/cudf/_lib/cpp/types.pxd | 68 +- python/cudf/cudf/_lib/cpp/types.pyx | 0 python/cudf/cudf/_lib/csv.pyx | 8 +- python/cudf/cudf/_lib/exception_handler.hpp | 80 - python/cudf/cudf/_lib/exception_handler.pxd | 66 +- python/cudf/cudf/_lib/expressions.pxd | 3 + python/cudf/cudf/_lib/expressions.pyx | 40 +- python/cudf/cudf/_lib/interop.pyx | 4 + python/cudf/cudf/_lib/io/datasource.pxd | 8 +- python/cudf/cudf/_lib/io/datasource.pyx | 7 +- python/cudf/cudf/_lib/io/utils.pxd | 10 +- python/cudf/cudf/_lib/io/utils.pyx | 6 +- python/cudf/cudf/_lib/join.pyx | 14 +- python/cudf/cudf/_lib/json.pyx | 15 +- python/cudf/cudf/_lib/null_mask.pyx | 13 +- python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 5 +- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 20 +- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 31 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 36 +- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 4 +- .../cudf/_lib/nvtext/subword_tokenize.pyx | 4 +- python/cudf/cudf/_lib/orc.pyx | 36 +- python/cudf/cudf/_lib/parquet.pyx | 46 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 21 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 18 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 16 + python/cudf/cudf/_lib/pylibcudf/column.pxd | 50 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 194 ++ python/cudf/cudf/_lib/pylibcudf/copying.pxd | 15 + python/cudf/cudf/_lib/pylibcudf/copying.pyx | 57 + .../cudf/_lib/pylibcudf/gpumemoryview.pxd | 9 + .../cudf/_lib/pylibcudf/gpumemoryview.pyx | 27 + python/cudf/cudf/_lib/pylibcudf/table.pxd | 18 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 62 + python/cudf/cudf/_lib/pylibcudf/types.pxd | 16 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 45 + python/cudf/cudf/_lib/pylibcudf/utils.pxd | 7 + python/cudf/cudf/_lib/pylibcudf/utils.pyx | 13 + python/cudf/cudf/_lib/scalar.pyx | 68 +- python/cudf/cudf/_lib/sort.pyx | 274 ++- python/cudf/cudf/_lib/stream_compaction.pyx | 39 +- python/cudf/cudf/_lib/strings/__init__.py | 4 +- python/cudf/cudf/_lib/strings/char_types.pyx | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- .../strings/convert/convert_fixed_point.pyx | 12 +- python/cudf/cudf/_lib/strings/translate.pyx | 4 +- python/cudf/cudf/_lib/strings_udf.pyx | 4 +- python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/_lib/types.pxd | 3 +- python/cudf/cudf/_lib/types.pyx | 82 +- python/cudf/cudf/_lib/utils.pxd | 3 +- python/cudf/cudf/_lib/utils.pyx | 28 +- python/cudf/cudf/api/extensions/__init__.py | 5 +- python/cudf/cudf/api/types.py | 28 +- python/cudf/cudf/core/_base_index.py | 376 +++- python/cudf/cudf/core/_compat.py | 1 + .../cudf/cudf/core/_internals/expressions.py | 2 + python/cudf/cudf/core/_internals/timezones.py | 40 +- python/cudf/cudf/core/_internals/where.py | 5 +- python/cudf/cudf/core/algorithms.py | 14 +- python/cudf/cudf/core/buffer/__init__.py | 2 +- python/cudf/cudf/core/buffer/buffer.py | 56 +- python/cudf/cudf/core/buffer/cow_buffer.py | 170 -- .../core/buffer/exposure_tracked_buffer.py | 311 +++ python/cudf/cudf/core/buffer/spill_manager.py | 2 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 31 +- python/cudf/cudf/core/buffer/utils.py | 17 +- python/cudf/cudf/core/column/categorical.py | 90 +- python/cudf/cudf/core/column/column.py | 450 +++-- python/cudf/cudf/core/column/datetime.py | 102 +- python/cudf/cudf/core/column/decimal.py | 68 +- python/cudf/cudf/core/column/interval.py | 3 +- python/cudf/cudf/core/column/numerical.py | 92 +- 
.../cudf/cudf/core/column/numerical_base.py | 10 + python/cudf/cudf/core/column/string.py | 181 +- python/cudf/cudf/core/column/struct.py | 8 +- python/cudf/cudf/core/column/timedelta.py | 39 +- python/cudf/cudf/core/column_accessor.py | 45 +- python/cudf/cudf/core/copy_types.py | 171 ++ python/cudf/cudf/core/dataframe.py | 884 ++++++--- python/cudf/cudf/core/dtypes.py | 72 +- python/cudf/cudf/core/frame.py | 285 +-- python/cudf/cudf/core/groupby/groupby.py | 196 +- python/cudf/cudf/core/index.py | 635 +++--- python/cudf/cudf/core/indexed_frame.py | 485 ++++- python/cudf/cudf/core/indexing_utils.py | 243 +++ python/cudf/cudf/core/join/_join_helpers.py | 36 +- python/cudf/cudf/core/join/join.py | 27 +- python/cudf/cudf/core/missing.py | 6 +- python/cudf/cudf/core/multiindex.py | 121 +- python/cudf/cudf/core/resample.py | 61 +- python/cudf/cudf/core/reshape.py | 29 +- python/cudf/cudf/core/scalar.py | 27 +- python/cudf/cudf/core/series.py | 188 +- python/cudf/cudf/core/single_column_frame.py | 43 +- python/cudf/cudf/core/subword_tokenizer.py | 3 +- python/cudf/cudf/core/tools/datetimes.py | 46 +- python/cudf/cudf/core/udf/groupby_lowering.py | 61 +- python/cudf/cudf/core/udf/groupby_typing.py | 105 +- python/cudf/cudf/core/udf/groupby_utils.py | 54 +- python/cudf/cudf/core/udf/masked_lowering.py | 1 + python/cudf/cudf/core/udf/masked_typing.py | 29 + python/cudf/cudf/core/udf/utils.py | 152 +- python/cudf/cudf/errors.py | 6 +- python/cudf/cudf/io/csv.py | 7 +- python/cudf/cudf/io/json.py | 7 +- python/cudf/cudf/io/orc.py | 4 +- python/cudf/cudf/io/parquet.py | 148 +- python/cudf/cudf/options.py | 49 + python/cudf/cudf/testing/testing.py | 4 +- .../orc/TestOrcFile.Spark.EmptyDecompData.orc | Bin 0 -> 373 bytes ...tOrcFile.Spark.NestedNotNullableStruct.orc | Bin 0 -> 310 bytes .../tests/data/parquet/delta_encoding.parquet | Bin 577 -> 577 bytes .../data/parquet/fixed_len_byte_array.parquet | Bin 0 -> 259 bytes .../data/parquet/rle_boolean_encoding.parquet | Bin 0 -> 192 bytes .../cudf/tests/dataframe/test_conversion.py | 37 + .../tests/indexes/datetime/test_indexing.py | 19 + .../indexes/datetime/test_time_specific.py | 16 + .../cudf/cudf/tests/indexes/test_interval.py | 309 +++ .../cudf/cudf/tests/input_output/test_text.py | 165 +- .../cudf/cudf/tests/series/test_conversion.py | 33 + .../cudf/tests/series/test_datetimelike.py | 67 + python/cudf/cudf/tests/test_api_types.py | 24 +- python/cudf/cudf/tests/test_array_function.py | 24 +- python/cudf/cudf/tests/test_binops.py | 102 +- python/cudf/cudf/tests/test_categorical.py | 10 + python/cudf/cudf/tests/test_column.py | 4 +- python/cudf/cudf/tests/test_copying.py | 582 +++--- python/cudf/cudf/tests/test_csv.py | 18 +- .../cudf/cudf/tests/test_custom_accessor.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 266 ++- python/cudf/cudf/tests/test_dataframe_copy.py | 42 +- python/cudf/cudf/tests/test_datetime.py | 74 + python/cudf/cudf/tests/test_decimal.py | 2 +- .../cudf/tests/test_extension_compilation.py | 15 +- python/cudf/cudf/tests/test_factorize.py | 37 +- python/cudf/cudf/tests/test_feather.py | 12 +- python/cudf/cudf/tests/test_groupby.py | 325 +++- python/cudf/cudf/tests/test_index.py | 412 ++-- python/cudf/cudf/tests/test_indexing.py | 440 ++++- python/cudf/cudf/tests/test_interval.py | 52 +- python/cudf/cudf/tests/test_joining.py | 48 +- python/cudf/cudf/tests/test_json.py | 77 +- python/cudf/cudf/tests/test_list.py | 7 +- python/cudf/cudf/tests/test_monotonic.py | 13 +- python/cudf/cudf/tests/test_multiindex.py | 86 +- 
python/cudf/cudf/tests/test_numba_import.py | 46 + python/cudf/cudf/tests/test_numerical.py | 6 +- python/cudf/cudf/tests/test_onehot.py | 100 +- python/cudf/cudf/tests/test_options.py | 114 +- python/cudf/cudf/tests/test_orc.py | 28 +- python/cudf/cudf/tests/test_parquet.py | 194 +- python/cudf/cudf/tests/test_quantiles.py | 15 + python/cudf/cudf/tests/test_rank.py | 12 +- python/cudf/cudf/tests/test_reductions.py | 69 +- python/cudf/cudf/tests/test_replace.py | 13 +- python/cudf/cudf/tests/test_repr.py | 119 +- python/cudf/cudf/tests/test_reshape.py | 112 ++ python/cudf/cudf/tests/test_scalar.py | 28 +- python/cudf/cudf/tests/test_search.py | 14 +- python/cudf/cudf/tests/test_serialize.py | 11 + python/cudf/cudf/tests/test_series.py | 84 +- python/cudf/cudf/tests/test_setitem.py | 97 +- python/cudf/cudf/tests/test_sparse_df.py | 4 +- python/cudf/cudf/tests/test_string.py | 8 +- python/cudf/cudf/tests/test_string_udfs.py | 8 +- python/cudf/cudf/tests/test_struct.py | 19 +- .../cudf/cudf/tests/test_subword_tokenizer.py | 238 --- python/cudf/cudf/tests/test_timedelta.py | 42 + python/cudf/cudf/tests/test_udf_masked_ops.py | 36 + python/cudf/cudf/tests/test_unaops.py | 11 +- .../cudf/tests/text/test_subword_tokenizer.py | 239 ++- .../test_text_methods.py} | 303 +-- python/cudf/cudf/utils/_numba.py | 191 ++ python/cudf/cudf/utils/_ptxcompiler.py | 107 + python/cudf/cudf/utils/applyutils.py | 12 +- python/cudf/cudf/utils/cudautils.py | 144 +- python/cudf/cudf/utils/dtypes.py | 22 +- python/cudf/cudf/utils/ioutils.py | 19 +- python/cudf/cudf/utils/queryutils.py | 9 +- python/cudf/cudf/utils/utils.py | 187 +- python/cudf/pyproject.toml | 28 +- python/cudf/udf_cpp/CMakeLists.txt | 2 +- python/cudf/udf_cpp/shim.cu | 135 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 32 +- python/cudf_kafka/pyproject.toml | 10 +- python/custreamz/custreamz/kafka.py | 21 +- .../custreamz/tests/test_dataframes.py | 4 +- python/custreamz/pyproject.toml | 6 +- python/dask_cudf/dask_cudf/__init__.py | 2 +- python/dask_cudf/dask_cudf/backends.py | 36 + python/dask_cudf/dask_cudf/io/csv.py | 23 +- python/dask_cudf/dask_cudf/io/parquet.py | 105 +- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 83 + .../dask_cudf/io/tests/test_parquet.py | 70 +- .../dask_cudf/tests/test_accessor.py | 34 +- .../dask_cudf/tests/test_dispatch.py | 65 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 30 +- python/dask_cudf/pyproject.toml | 14 +- 1080 files changed, 46293 insertions(+), 20963 deletions(-) delete mode 100644 .flake8 create mode 100644 .github/copy-pr-bot.yaml create mode 100755 ci/build_wheel.sh create mode 100755 ci/build_wheel_cudf.sh create mode 100755 ci/build_wheel_dask_cudf.sh delete mode 100755 ci/docs/build.sh delete mode 100755 ci/release/apply_wheel_modifications.sh create mode 100755 ci/test_wheel_cudf.sh create mode 100755 ci/test_wheel_dask_cudf.sh create mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml delete mode 100644 conda/recipes/libcudf/nvcomp.txt delete mode 100644 conda/recipes/libcudf/post-link.sh create mode 100644 cpp/benchmarks/fixture/nvbench_fixture.hpp delete mode 100644 cpp/benchmarks/fixture/rmm_pool_raii.hpp create mode 100644 cpp/benchmarks/stream_compaction/stable_distinct.cpp create mode 100644 cpp/benchmarks/string/char_types.cpp create mode 100644 cpp/benchmarks/string/count.cpp create mode 100644 cpp/benchmarks/string/gather.cpp create mode 100644 cpp/benchmarks/string/join_strings.cpp create mode 100644 
cpp/benchmarks/string/split_re.cpp create mode 100644 cpp/benchmarks/text/edit_distance.cpp create mode 100644 cpp/benchmarks/text/hash_ngrams.cpp create mode 100644 cpp/benchmarks/text/jaccard.cpp delete mode 100644 cpp/benchmarks/text/normalize_spaces.cpp create mode 100644 cpp/cmake/thirdparty/get_libcudacxx.cmake create mode 100644 cpp/include/cudf/ast/detail/expression_transformer.hpp rename cpp/include/cudf/detail/{concatenate.cuh => concatenate_masks.hpp} (76%) delete mode 100644 cpp/include/cudf/detail/utilities/hash_functions.cuh create mode 100644 cpp/include/cudf/detail/utilities/stacktrace.hpp create mode 100644 cpp/include/cudf/hashing/detail/default_hash.cuh create mode 100644 cpp/include/cudf/hashing/detail/hash_functions.cuh rename cpp/include/cudf/{ => hashing}/detail/hashing.hpp (62%) create mode 100644 cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh create mode 100644 cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh create mode 100644 cpp/include/cudf/io/arrow_io_source.hpp create mode 100644 cpp/include/cudf/io/parquet_metadata.hpp create mode 100644 cpp/include/cudf/reduction/detail/reduction.hpp create mode 100644 cpp/include/nvtext/detail/generate_ngrams.hpp create mode 100644 cpp/include/nvtext/jaccard.hpp create mode 100644 cpp/src/hash/murmurhash3_x64_128.cu rename cpp/src/hash/{murmur_hash.cu => murmurhash3_x86_32.cu} (64%) rename cpp/src/hash/{spark_murmur_hash.cu => spark_murmurhash3_x86_32.cu} (81%) create mode 100644 cpp/src/hash/xxhash_64.cu create mode 100644 cpp/src/io/comp/statistics.cu rename cpp/src/io/json/{experimental => }/byte_range_info.cu (89%) rename cpp/src/io/json/{ => legacy}/json_gpu.cu (95%) rename cpp/src/io/json/{ => legacy}/json_gpu.hpp (95%) create mode 100644 cpp/src/io/json/legacy/read_json.hpp rename cpp/src/io/json/{ => legacy}/reader_impl.cu (94%) rename cpp/src/io/json/{experimental/read_json.cpp => read_json.cu} (80%) rename cpp/src/io/json/{experimental => }/read_json.hpp (91%) create mode 100644 cpp/src/io/parquet/decode_preprocess.cu create mode 100644 cpp/src/io/parquet/delta_binary.cuh create mode 100644 cpp/src/io/parquet/page_decode.cuh create mode 100644 cpp/src/io/parquet/page_delta_decode.cu create mode 100644 cpp/src/io/parquet/page_string_decode.cu create mode 100644 cpp/src/io/parquet/page_string_utils.cuh create mode 100644 cpp/src/io/parquet/predicate_pushdown.cpp create mode 100644 cpp/src/io/parquet/rle_stream.cuh create mode 100644 cpp/src/io/utilities/arrow_io_source.cpp create mode 100644 cpp/src/reductions/nested_type_minmax_util.cuh delete mode 100644 cpp/src/reductions/struct_minmax_util.cuh create mode 100644 cpp/src/rolling/detail/optimized_unbounded_window.cpp create mode 100644 cpp/src/rolling/detail/optimized_unbounded_window.hpp create mode 100644 cpp/src/rolling/detail/range_comparator_utils.cuh create mode 100644 cpp/src/text/jaccard.cu create mode 100644 cpp/src/utilities/stacktrace.cpp delete mode 100644 cpp/tests/hashing/hash_test.cpp create mode 100644 cpp/tests/hashing/md5_test.cpp create mode 100644 cpp/tests/hashing/murmurhash3_x64_128_test.cpp create mode 100644 cpp/tests/hashing/murmurhash3_x86_32_test.cpp create mode 100644 cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp create mode 100644 cpp/tests/hashing/xxhash_64_test.cpp create mode 100644 cpp/tests/rolling/range_comparator_test.cu create mode 100644 cpp/tests/stream_compaction/stable_distinct_tests.cpp create mode 100644 cpp/tests/streams/concatenate_test.cpp create mode 100644 cpp/tests/streams/copying_test.cpp 
create mode 100644 cpp/tests/streams/filling_test.cpp create mode 100644 cpp/tests/streams/groupby_test.cpp create mode 100644 cpp/tests/streams/hash_test.cpp create mode 100644 cpp/tests/streams/replace_test.cpp create mode 100644 cpp/tests/streams/search_test.cpp create mode 100644 cpp/tests/table/row_operator_tests_utilities.cu create mode 100644 cpp/tests/table/row_operator_tests_utilities.hpp create mode 100644 cpp/tests/text/jaccard_tests.cpp rename docs/cudf/source/_templates/autosummary/{class_without_autosummary.rst => class.rst} (50%) delete mode 100644 docs/cudf/source/_templates/autosummary/class_with_autosummary.rst create mode 100644 docs/cudf/source/api_docs/extension_dtypes.rst create mode 100644 docs/cudf/source/developer_guide/pylibcudf.md delete mode 100644 docs/cudf/source/user_guide/cudf.CategoricalDtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal128Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal32Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal64Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.ListDtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.StructDtype.rst create mode 100644 java/src/main/java/ai/rapids/cudf/ChunkedPack.java create mode 100755 java/src/main/java/ai/rapids/cudf/CudfColumnSizeOverflowException.java create mode 100644 java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java create mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryAllocator.java create mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java create mode 100644 java/src/main/java/ai/rapids/cudf/PackedColumnMetadata.java create mode 100644 java/src/main/java/ai/rapids/cudf/TableDebug.java create mode 100644 java/src/main/native/src/ChunkedPackJni.cpp create mode 100644 java/src/main/native/src/PackedColumnMetadataJni.cpp create mode 100644 java/src/test/java/ai/rapids/cudf/LargeTableTest.java create mode 100644 python/cudf/cudf/_lib/cpp/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/cpp/copying.pyx create mode 100644 python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/data_sink.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/datasource.pxd create mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd create mode 100644 python/cudf/cudf/_lib/cpp/types.pyx delete mode 100644 python/cudf/cudf/_lib/exception_handler.hpp create mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/column.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/column.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/copying.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/copying.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/table.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/table.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/types.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/utils.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/utils.pyx delete mode 100644 
python/cudf/cudf/core/buffer/cow_buffer.py
 create mode 100644 python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
 create mode 100644 python/cudf/cudf/core/copy_types.py
 create mode 100644 python/cudf/cudf/core/indexing_utils.py
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.NestedNotNullableStruct.orc
 create mode 100644 python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet
 create mode 100644 python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet
 create mode 100644 python/cudf/cudf/tests/indexes/datetime/test_indexing.py
 create mode 100644 python/cudf/cudf/tests/test_numba_import.py
 delete mode 100644 python/cudf/cudf/tests/test_subword_tokenizer.py
 rename python/cudf/cudf/tests/{test_text.py => text/test_text_methods.py} (81%)
 create mode 100644 python/cudf/cudf/utils/_numba.py
 create mode 100644 python/cudf/cudf/utils/_ptxcompiler.py

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index e80e3afc443..00000000000
--- a/.flake8
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2017-2023, NVIDIA CORPORATION.
-
-[flake8]
-filename = *.py, *.pyx, *.pxd, *.pxi
-exclude = __init__.py, *.egg, build, docs, .git
-force-check = True
-ignore =
-    # line break before binary operator
-    W503,
-    # whitespace before :
-    E203
-per-file-ignores =
-    # Rules ignored only in Cython:
-    # E211: whitespace before '(' (used in multi-line imports)
-    # E225: Missing whitespace around operators (breaks cython casting syntax like )
-    # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
-    # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
-    # E275: Missing whitespace after keyword (Doesn't work with Cython except?)
-    # E402: invalid syntax (works for Python, not Cython)
-    # E999: invalid syntax (works for Python, not Cython)
-    # W504: line break after binary operator (breaks lines that end with a pointer)
-    *.pyx: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxd: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxi: E211, E225, E226, E227, E275, E402, E999, W504
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 00000000000..895ba83ee54
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
index 2d1444c595d..9a0b4155035 100644
--- a/.github/ops-bot.yaml
+++ b/.github/ops-bot.yaml
@@ -5,5 +5,4 @@ auto_merger: true
 branch_checker: true
 label_checker: true
 release_drafter: true
-copy_prs: true
 recently_updated: true
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 2d592d3f247..91ec0904103 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -54,31 +54,32 @@ jobs:
       sha: ${{ inputs.sha }}
       skip_upload_pkgs: libcudf-example
   docs-build:
-    if: github.ref_type == 'branch' && github.event_name == 'push'
+    if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
-      build_type: branch
-      node_type: "gpu-v100-latest-1"
       arch: "amd64"
+      branch: ${{ inputs.branch }}
+      build_type: ${{ inputs.build_type || 'branch' }}
       container_image: "rapidsai/ci:latest"
+      date: ${{ inputs.date }}
+      node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
+      sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
     with:
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
      sha: ${{ inputs.sha }}
      date: ${{ inputs.date }}
-      package-name: cudf
-      package-dir: python/cudf
-      skbuild-configure-options: "-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF"
+      script: ci/build_wheel_cudf.sh
  wheel-publish-cudf:
    needs: wheel-build-cudf
    secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
    with:
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
@@ -88,18 +89,18 @@ jobs:
  wheel-build-dask-cudf:
    needs: wheel-publish-cudf
    secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
    with:
+      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
      sha: ${{ inputs.sha }}
      date: ${{ inputs.date }}
-      package-name: dask_cudf
-      package-dir: python/dask_cudf
+      script: ci/build_wheel_dask_cudf.sh
  wheel-publish-dask-cudf:
    needs: wheel-build-dask-cudf
    secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
    with:
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 040fac16b8d..b47a40b13d2 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -26,34 +26,34 @@ jobs:
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: pull-request
       test_script: "ci/test_python_cudf.sh"
@@ -61,14 +61,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: pull-request
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses:
rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,37 +98,30 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: build_type: pull-request - package-name: cudf - package-dir: python/cudf - skbuild-configure-options: "-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" + script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: pull-request - package-name: cudf - test-unittest: "python -m pytest -n 8 ./python/cudf/cudf/tests" - test-smoketest: "python ./ci/wheel_smoke_test_cudf.py" + script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request - package-name: dask_cudf - package-dir: python/dask_cudf - before-wheel: "RAPIDS_PY_WHEEL_NAME=cudf_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-cudf && python -m pip install --no-deps ./local-cudf/cudf*.whl" + script: "ci/build_wheel_dask_cudf.sh" wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request - package-name: dask_cudf - # Install the cudf we just built, and also test against latest dask/distributed/dask-cuda. 
- test-before: "RAPIDS_PY_WHEEL_NAME=cudf_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && python -m pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests" + script: ci/test_wheel_dask_cudf.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a6fbc522845..6bd2787d6dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,23 +79,20 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - package-name: cudf - test-unittest: "python -m pytest -n 8 ./python/cudf/cudf/tests" + script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly 
branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - package-name: dask_cudf - # Test against latest dask/distributed/dask-cuda. - test-before: "python -m pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests" + script: ci/test_wheel_dask_cudf.sh diff --git a/.gitignore b/.gitignore index fb5c301fe3f..a9bf0854d65 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ DartConfiguration.tcl *.spec .nfs* .clangd +compile_commands.json ## Python build directories & artifacts dask-worker-space/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ac54113278..b5165cf026f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,17 +30,8 @@ repos: files: python/.* # Explicitly specify the pyproject.toml at the repo root, not per-project. args: ["--config", "pyproject.toml"] - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: ["--config=.flake8"] - files: python/.*$ - types: [file] - types_or: [python, cython] - additional_dependencies: ["flake8-force"] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.1.10 + rev: v0.15.0 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy @@ -85,7 +76,7 @@ repos: (?x)^( ^cpp/include/cudf_test/cxxopts.hpp| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| - ^python/cudf/cudf/tests/test_text.py + ^python/cudf/cudf/tests/text/test_text_methods.py ) - repo: local hooks: @@ -165,6 +156,12 @@ repos: hooks: - id: rapids-dependency-file-generator args: ["--clean"] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.278 + hooks: + - id: ruff + files: python/.*$ + default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c28dba6462..76abf241d96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,431 @@ +# cuDF 23.08.00 (9 Aug 2023) + +## 🚨 Breaking Changes + +- Enforce deprecations and add clarifications around existing deprecations ([#13710](https://github.com/rapidsai/cudf/pull/13710)) [@galipremsagar](https://github.com/galipremsagar) +- Separate MurmurHash32 from hash_functions.cuh ([#13681](https://github.com/rapidsai/cudf/pull/13681)) [@davidwendt](https://github.com/davidwendt) +- Avoid storing metadata in pointers in ORC and Parquet writers ([#13648](https://github.com/rapidsai/cudf/pull/13648)) [@vuule](https://github.com/vuule) +- Expose streams in all public copying APIs ([#13629](https://github.com/rapidsai/cudf/pull/13629)) [@vyasr](https://github.com/vyasr) +- Remove deprecated cudf::strings::slice_strings (by delimiter) functions ([#13628](https://github.com/rapidsai/cudf/pull/13628)) [@davidwendt](https://github.com/davidwendt) +- Remove deprecated cudf.set_allocator. 
([#13591](https://github.com/rapidsai/cudf/pull/13591)) [@bdice](https://github.com/bdice) +- Change build.sh to use pip install instead of setup.py ([#13507](https://github.com/rapidsai/cudf/pull/13507)) [@vyasr](https://github.com/vyasr) +- Remove unused max_rows_tensor parameter from subword tokenizer ([#13463](https://github.com/rapidsai/cudf/pull/13463)) [@davidwendt](https://github.com/davidwendt) +- Fix decimal scale reductions in `_get_decimal_type` ([#13224](https://github.com/rapidsai/cudf/pull/13224)) [@charlesbluca](https://github.com/charlesbluca) + +## 🐛 Bug Fixes + +- Add CUDA version to cudf_kafka and libcudf-example build strings. ([#13769](https://github.com/rapidsai/cudf/pull/13769)) [@bdice](https://github.com/bdice) +- Fix typo in wheels-test.yaml. ([#13763](https://github.com/rapidsai/cudf/pull/13763)) [@bdice](https://github.com/bdice) +- Don't test strings shorter than the requested ngram size ([#13758](https://github.com/rapidsai/cudf/pull/13758)) [@vyasr](https://github.com/vyasr) +- Add CUDA version to custreamz build string. ([#13754](https://github.com/rapidsai/cudf/pull/13754)) [@bdice](https://github.com/bdice) +- Fix writing of ORC files with empty child string columns ([#13745](https://github.com/rapidsai/cudf/pull/13745)) [@vuule](https://github.com/vuule) +- Remove the erroneous "empty level" short-circuit from ORC reader ([#13722](https://github.com/rapidsai/cudf/pull/13722)) [@vuule](https://github.com/vuule) +- Fix character counting when writing sliced tables into ORC ([#13721](https://github.com/rapidsai/cudf/pull/13721)) [@vuule](https://github.com/vuule) +- Parquet uses row group row count if missing from header ([#13712](https://github.com/rapidsai/cudf/pull/13712)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix reading of RLE encoded boolean data from parquet files with V2 page headers ([#13707](https://github.com/rapidsai/cudf/pull/13707)) [@etseidl](https://github.com/etseidl) +- Fix a corner case of list lexicographic comparator ([#13701](https://github.com/rapidsai/cudf/pull/13701)) [@ttnghia](https://github.com/ttnghia) +- Fix combined filtering and column projection in `dask_cudf.read_parquet` ([#13697](https://github.com/rapidsai/cudf/pull/13697)) [@rjzamora](https://github.com/rjzamora) +- Revert fetch-rapids changes ([#13696](https://github.com/rapidsai/cudf/pull/13696)) [@vyasr](https://github.com/vyasr) +- Data generator - include offsets in the size estimate of list elements ([#13688](https://github.com/rapidsai/cudf/pull/13688)) [@vuule](https://github.com/vuule) +- Add `cuda-nvcc-impl` to `cudf` for `numba` CUDA 12 ([#13673](https://github.com/rapidsai/cudf/pull/13673)) [@jakirkham](https://github.com/jakirkham) +- Fix combined filtering and column projection in `read_parquet` ([#13666](https://github.com/rapidsai/cudf/pull/13666)) [@rjzamora](https://github.com/rjzamora) +- Use `thrust::identity` as hash functions for byte pair encoding ([#13665](https://github.com/rapidsai/cudf/pull/13665)) [@PointKernel](https://github.com/PointKernel) +- Fix loc-getitem ordering when index contains duplicate labels ([#13659](https://github.com/rapidsai/cudf/pull/13659)) [@wence-](https://github.com/wence-) +- [REVIEW] Introduce parity with pandas for `MultiIndex.loc` ordering & fix a bug in `Groupby` with `as_index` ([#13657](https://github.com/rapidsai/cudf/pull/13657)) [@galipremsagar](https://github.com/galipremsagar) +- Fix memcheck error found in nvtext tokenize functions ([#13649](https://github.com/rapidsai/cudf/pull/13649))
[@davidwendt](https://github.com/davidwendt) +- Fix `has_nonempty_nulls` ignoring column offset ([#13647](https://github.com/rapidsai/cudf/pull/13647)) [@ttnghia](https://github.com/ttnghia) +- [Java] Avoid double-free corruption in case of an Exception while creating a ColumnView ([#13645](https://github.com/rapidsai/cudf/pull/13645)) [@razajafri](https://github.com/razajafri) +- Fix memcheck error in ORC reader call to cudf::io::copy_uncompressed_kernel ([#13643](https://github.com/rapidsai/cudf/pull/13643)) [@davidwendt](https://github.com/davidwendt) +- Fix CUDA 12 conda environment to remove cubinlinker and ptxcompiler. ([#13636](https://github.com/rapidsai/cudf/pull/13636)) [@bdice](https://github.com/bdice) +- Fix inf/NaN comparisons for FLOAT orderby in window functions ([#13635](https://github.com/rapidsai/cudf/pull/13635)) [@mythrocks](https://github.com/mythrocks) +- Refactor `Index` search to simplify code and increase correctness ([#13625](https://github.com/rapidsai/cudf/pull/13625)) [@wence-](https://github.com/wence-) +- Fix compile warning for unused variable in split_re.cu ([#13621](https://github.com/rapidsai/cudf/pull/13621)) [@davidwendt](https://github.com/davidwendt) +- Fix tz_localize for dask_cudf Series ([#13610](https://github.com/rapidsai/cudf/pull/13610)) [@shwina](https://github.com/shwina) +- Fix issue with no decompressed data in ORC reader ([#13609](https://github.com/rapidsai/cudf/pull/13609)) [@vuule](https://github.com/vuule) +- Fix floating point window range extents. ([#13606](https://github.com/rapidsai/cudf/pull/13606)) [@mythrocks](https://github.com/mythrocks) +- Fix `localize(None)` for timezone-naive columns ([#13603](https://github.com/rapidsai/cudf/pull/13603)) [@shwina](https://github.com/shwina) +- Fixed a memory leak caused by Exception thrown while constructing a ColumnView ([#13597](https://github.com/rapidsai/cudf/pull/13597)) [@razajafri](https://github.com/razajafri) +- Handle nullptr return value from bitmask_or in distinct_count ([#13590](https://github.com/rapidsai/cudf/pull/13590)) [@wence-](https://github.com/wence-) +- Bring parity with pandas in Index.join ([#13589](https://github.com/rapidsai/cudf/pull/13589)) [@galipremsagar](https://github.com/galipremsagar) +- Fix cudf.melt when there are more than 255 columns ([#13588](https://github.com/rapidsai/cudf/pull/13588)) [@hcho3](https://github.com/hcho3) +- Fix memory issues in cuIO due to removal of memory padding ([#13586](https://github.com/rapidsai/cudf/pull/13586)) [@ttnghia](https://github.com/ttnghia) +- Fix Parquet multi-file reading ([#13584](https://github.com/rapidsai/cudf/pull/13584)) [@etseidl](https://github.com/etseidl) +- Fix memcheck error found in LISTS_TEST ([#13579](https://github.com/rapidsai/cudf/pull/13579)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck error found in STRINGS_TEST ([#13578](https://github.com/rapidsai/cudf/pull/13578)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck error found in INTEROP_TEST ([#13577](https://github.com/rapidsai/cudf/pull/13577)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck errors found in REDUCTION_TEST ([#13574](https://github.com/rapidsai/cudf/pull/13574)) [@davidwendt](https://github.com/davidwendt) +- Preemptive fix for hive-partitioning change in dask ([#13564](https://github.com/rapidsai/cudf/pull/13564)) [@rjzamora](https://github.com/rjzamora) +- Fix an issue with `dask_cudf.read_csv` when lines are needed to be skipped 
([#13555](https://github.com/rapidsai/cudf/pull/13555)) [@galipremsagar](https://github.com/galipremsagar) +- Fix out-of-bounds memory write in cudf::dictionary::detail::concatenate ([#13554](https://github.com/rapidsai/cudf/pull/13554)) [@davidwendt](https://github.com/davidwendt) +- Fix the null mask size in json reader ([#13537](https://github.com/rapidsai/cudf/pull/13537)) [@karthikeyann](https://github.com/karthikeyann) +- Fix cudf::strings::strip for all-empty input column ([#13533](https://github.com/rapidsai/cudf/pull/13533)) [@davidwendt](https://github.com/davidwendt) +- Make sure to build without isolation or installing dependencies ([#13524](https://github.com/rapidsai/cudf/pull/13524)) [@vyasr](https://github.com/vyasr) +- Remove preload lib from CMake for now ([#13519](https://github.com/rapidsai/cudf/pull/13519)) [@vyasr](https://github.com/vyasr) +- Fix missing separator after null values in JSON writer ([#13503](https://github.com/rapidsai/cudf/pull/13503)) [@karthikeyann](https://github.com/karthikeyann) +- Ensure `single_lane_block_sum_reduce` is safe to call in a loop ([#13488](https://github.com/rapidsai/cudf/pull/13488)) [@wence-](https://github.com/wence-) +- Update all versions in pyproject.toml files. ([#13486](https://github.com/rapidsai/cudf/pull/13486)) [@bdice](https://github.com/bdice) +- Remove applying nvbench that doesn't exist in 23.08 ([#13484](https://github.com/rapidsai/cudf/pull/13484)) [@robertmaynard](https://github.com/robertmaynard) +- Fix chunked Parquet reader benchmark ([#13482](https://github.com/rapidsai/cudf/pull/13482)) [@vuule](https://github.com/vuule) +- Update JNI JSON reader column compatibility for Spark ([#13477](https://github.com/rapidsai/cudf/pull/13477)) [@revans2](https://github.com/revans2) +- Fix unsanitized output of scan with strings ([#13455](https://github.com/rapidsai/cudf/pull/13455)) [@davidwendt](https://github.com/davidwendt) +- Reject functions without bytecode from `_can_be_jitted` in GroupBy Apply ([#13429](https://github.com/rapidsai/cudf/pull/13429)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix decimal scale reductions in `_get_decimal_type` ([#13224](https://github.com/rapidsai/cudf/pull/13224)) [@charlesbluca](https://github.com/charlesbluca) + +## 📖 Documentation + +- Fix doxygen groups for io data sources and sinks ([#13718](https://github.com/rapidsai/cudf/pull/13718)) [@davidwendt](https://github.com/davidwendt) +- Add pandas compatibility note to DataFrame.query docstring ([#13693](https://github.com/rapidsai/cudf/pull/13693)) [@beckernick](https://github.com/beckernick) +- Add pylibcudf to developer guide ([#13639](https://github.com/rapidsai/cudf/pull/13639)) [@vyasr](https://github.com/vyasr) +- Fix repeated words in doxygen text ([#13598](https://github.com/rapidsai/cudf/pull/13598)) [@karthikeyann](https://github.com/karthikeyann) +- Update docs for top-level API.
([#13592](https://github.com/rapidsai/cudf/pull/13592)) [@bdice](https://github.com/bdice) +- Fix the doxygen text for cudf::concatenate and other places ([#13561](https://github.com/rapidsai/cudf/pull/13561)) [@davidwendt](https://github.com/davidwendt) +- Document stream validation approach used in testing ([#13556](https://github.com/rapidsai/cudf/pull/13556)) [@vyasr](https://github.com/vyasr) +- Cleanup doc repetitions in libcudf ([#13470](https://github.com/rapidsai/cudf/pull/13470)) [@karthikeyann](https://github.com/karthikeyann) + +## 🚀 New Features + +- Support `min` and `max` aggregations for list type in groupby and reduction ([#13676](https://github.com/rapidsai/cudf/pull/13676)) [@ttnghia](https://github.com/ttnghia) +- Add nvtext::jaccard_index API for strings columns ([#13669](https://github.com/rapidsai/cudf/pull/13669)) [@davidwendt](https://github.com/davidwendt) +- Add read_parquet_metadata libcudf API ([#13663](https://github.com/rapidsai/cudf/pull/13663)) [@karthikeyann](https://github.com/karthikeyann) +- Expose streams in all public copying APIs ([#13629](https://github.com/rapidsai/cudf/pull/13629)) [@vyasr](https://github.com/vyasr) +- Add XXHash_64 hash function to cudf ([#13612](https://github.com/rapidsai/cudf/pull/13612)) [@davidwendt](https://github.com/davidwendt) +- Java support: Floating point order-by columns for RANGE window functions ([#13595](https://github.com/rapidsai/cudf/pull/13595)) [@mythrocks](https://github.com/mythrocks) +- Use `cuco::static_map` to build string dictionaries in ORC writer ([#13580](https://github.com/rapidsai/cudf/pull/13580)) [@vuule](https://github.com/vuule) +- Add pylibcudf subpackage with gather implementation ([#13562](https://github.com/rapidsai/cudf/pull/13562)) [@vyasr](https://github.com/vyasr) +- Add JNI for `lists::concatenate_list_elements` ([#13547](https://github.com/rapidsai/cudf/pull/13547)) [@ttnghia](https://github.com/ttnghia) +- Enable nested types for `lists::concatenate_list_elements` ([#13545](https://github.com/rapidsai/cudf/pull/13545)) [@ttnghia](https://github.com/ttnghia) +- Add unicode encoding for string columns in JSON writer ([#13539](https://github.com/rapidsai/cudf/pull/13539)) [@karthikeyann](https://github.com/karthikeyann) +- Remove numba kernels from `find_index_of_val` ([#13517](https://github.com/rapidsai/cudf/pull/13517)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Floating point order-by columns for RANGE window functions ([#13512](https://github.com/rapidsai/cudf/pull/13512)) [@mythrocks](https://github.com/mythrocks) +- Parse column chunk metadata statistics in parquet reader ([#13472](https://github.com/rapidsai/cudf/pull/13472)) [@karthikeyann](https://github.com/karthikeyann) +- Add `abs` function to apply ([#13408](https://github.com/rapidsai/cudf/pull/13408)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- [FEA] AST filtering in parquet reader ([#13348](https://github.com/rapidsai/cudf/pull/13348)) [@karthikeyann](https://github.com/karthikeyann) +- [FEA] Adds option to recover from invalid JSON lines in JSON tokenizer ([#13344](https://github.com/rapidsai/cudf/pull/13344)) [@elstehle](https://github.com/elstehle) +- Ensure cccl packages don't clash with upstream version ([#13235](https://github.com/rapidsai/cudf/pull/13235)) [@robertmaynard](https://github.com/robertmaynard) +- Update `struct_minmax_util` to experimental row comparator ([#13069](https://github.com/rapidsai/cudf/pull/13069)) [@divyegala](https://github.com/divyegala) +- Add
stream parameter to hashing APIs ([#12090](https://github.com/rapidsai/cudf/pull/12090)) [@vyasr](https://github.com/vyasr) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.08` release ([#13802](https://github.com/rapidsai/cudf/pull/13802)) [@galipremsagar](https://github.com/galipremsagar) +- Relax protobuf pinnings. ([#13770](https://github.com/rapidsai/cudf/pull/13770)) [@bdice](https://github.com/bdice) +- Switch fully unbounded window functions to use aggregations ([#13727](https://github.com/rapidsai/cudf/pull/13727)) [@mythrocks](https://github.com/mythrocks) +- Switch to new wheel building pipeline ([#13723](https://github.com/rapidsai/cudf/pull/13723)) [@vyasr](https://github.com/vyasr) +- Revert CUDA 12.0 CI workflows to branch-23.08. ([#13719](https://github.com/rapidsai/cudf/pull/13719)) [@bdice](https://github.com/bdice) +- Adding identify minimum version requirement ([#13713](https://github.com/rapidsai/cudf/pull/13713)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Enforce deprecations and add clarifications around existing deprecations ([#13710](https://github.com/rapidsai/cudf/pull/13710)) [@galipremsagar](https://github.com/galipremsagar) +- Optimize ORC reader performance for list data ([#13708](https://github.com/rapidsai/cudf/pull/13708)) [@vyasr](https://github.com/vyasr) +- fix limit overflow message in a docstring ([#13703](https://github.com/rapidsai/cudf/pull/13703)) [@ahmet-uyar](https://github.com/ahmet-uyar) +- Alleviates JSON parser's need for multi-file sources to end with a newline ([#13702](https://github.com/rapidsai/cudf/pull/13702)) [@elstehle](https://github.com/elstehle) +- Update cython-lint and replace flake8 with ruff ([#13699](https://github.com/rapidsai/cudf/pull/13699)) [@vyasr](https://github.com/vyasr) +- Add `__dask_tokenize__` definitions to cudf classes ([#13695](https://github.com/rapidsai/cudf/pull/13695)) [@rjzamora](https://github.com/rjzamora) +- Convert libcudf hashing benchmarks to nvbench ([#13694](https://github.com/rapidsai/cudf/pull/13694)) [@davidwendt](https://github.com/davidwendt) +- Separate MurmurHash32 from hash_functions.cuh ([#13681](https://github.com/rapidsai/cudf/pull/13681)) [@davidwendt](https://github.com/davidwendt) +- Improve performance of cudf::strings::split on whitespace ([#13680](https://github.com/rapidsai/cudf/pull/13680)) [@davidwendt](https://github.com/davidwendt) +- Allow ORC and Parquet writers to write nullable columns without nulls as non-nullable ([#13675](https://github.com/rapidsai/cudf/pull/13675)) [@vuule](https://github.com/vuule) +- Raise a NotImplementedError in to_datetime when utc is passed ([#13670](https://github.com/rapidsai/cudf/pull/13670)) [@shwina](https://github.com/shwina) +- Add rmm_mode parameter to nvbench base fixture ([#13668](https://github.com/rapidsai/cudf/pull/13668)) [@davidwendt](https://github.com/davidwendt) +- Fix multiindex loc ordering in pandas-compat mode ([#13660](https://github.com/rapidsai/cudf/pull/13660)) [@wence-](https://github.com/wence-) +- Add nvtext hash_character_ngrams function ([#13654](https://github.com/rapidsai/cudf/pull/13654)) [@davidwendt](https://github.com/davidwendt) +- Avoid storing metadata in pointers in ORC and Parquet writers ([#13648](https://github.com/rapidsai/cudf/pull/13648)) [@vuule](https://github.com/vuule) +- Acquire spill lock in to/from_arrow ([#13646](https://github.com/rapidsai/cudf/pull/13646)) [@shwina](https://github.com/shwina) +- Expose stable versions of libcudf sort routines 
([#13634](https://github.com/rapidsai/cudf/pull/13634)) [@wence-](https://github.com/wence-) +- Separate out hash_test.cpp source for each hash API ([#13633](https://github.com/rapidsai/cudf/pull/13633)) [@davidwendt](https://github.com/davidwendt) +- Remove deprecated cudf::strings::slice_strings (by delimiter) functions ([#13628](https://github.com/rapidsai/cudf/pull/13628)) [@davidwendt](https://github.com/davidwendt) +- Create separate libcudf hash APIs for each supported hash function ([#13626](https://github.com/rapidsai/cudf/pull/13626)) [@davidwendt](https://github.com/davidwendt) +- Add convert_dtypes API ([#13623](https://github.com/rapidsai/cudf/pull/13623)) [@shwina](https://github.com/shwina) +- Clean up cupy in dependencies.yaml. ([#13617](https://github.com/rapidsai/cudf/pull/13617)) [@bdice](https://github.com/bdice) +- Use cuda-version to constrain cudatoolkit. ([#13615](https://github.com/rapidsai/cudf/pull/13615)) [@bdice](https://github.com/bdice) +- Add murmurhash3_x64_128 function to libcudf ([#13604](https://github.com/rapidsai/cudf/pull/13604)) [@davidwendt](https://github.com/davidwendt) +- Performance improvement for cudf::strings::like ([#13594](https://github.com/rapidsai/cudf/pull/13594)) [@davidwendt](https://github.com/davidwendt) +- Remove deprecated cudf.set_allocator. ([#13591](https://github.com/rapidsai/cudf/pull/13591)) [@bdice](https://github.com/bdice) +- Clean up cudf device atomic with `cuda::atomic_ref` ([#13583](https://github.com/rapidsai/cudf/pull/13583)) [@PointKernel](https://github.com/PointKernel) +- Add java bindings for distinct count ([#13573](https://github.com/rapidsai/cudf/pull/13573)) [@revans2](https://github.com/revans2) +- Use nvcomp conda package. ([#13566](https://github.com/rapidsai/cudf/pull/13566)) [@bdice](https://github.com/bdice) +- Add exception to string_scalar if input string exceeds size_type ([#13560](https://github.com/rapidsai/cudf/pull/13560)) [@davidwendt](https://github.com/davidwendt) +- Add dispatch for `cudf.Dataframe` to/from `pyarrow.Table` conversion ([#13558](https://github.com/rapidsai/cudf/pull/13558)) [@rjzamora](https://github.com/rjzamora) +- Get rid of `cuco::pair_type` aliases ([#13553](https://github.com/rapidsai/cudf/pull/13553)) [@PointKernel](https://github.com/PointKernel) +- Introduce parity with pandas when `sort=False` in `Groupby` ([#13551](https://github.com/rapidsai/cudf/pull/13551)) [@galipremsagar](https://github.com/galipremsagar) +- Update CMake in docker to 3.26.4 ([#13550](https://github.com/rapidsai/cudf/pull/13550)) [@NvTimLiu](https://github.com/NvTimLiu) +- Clarify source of error message in stream testing. ([#13541](https://github.com/rapidsai/cudf/pull/13541)) [@bdice](https://github.com/bdice) +- Deprecate `strings_to_categorical` in `cudf.read_parquet` ([#13540](https://github.com/rapidsai/cudf/pull/13540)) [@galipremsagar](https://github.com/galipremsagar) +- Update to CMake 3.26.4 ([#13538](https://github.com/rapidsai/cudf/pull/13538)) [@vyasr](https://github.com/vyasr) +- s3 folder naming fix ([#13536](https://github.com/rapidsai/cudf/pull/13536)) [@AyodeAwe](https://github.com/AyodeAwe) +- Implement iloc-getitem using parse-don't-validate approach ([#13534](https://github.com/rapidsai/cudf/pull/13534)) [@wence-](https://github.com/wence-) +- Make synchronization explicit in the names of `hostdevice_*` copying APIs ([#13530](https://github.com/rapidsai/cudf/pull/13530)) [@ttnghia](https://github.com/ttnghia) +- Add benchmark (Google Benchmark) dependency to conda packages. 
([#13528](https://github.com/rapidsai/cudf/pull/13528)) [@bdice](https://github.com/bdice) +- Add libcufile to dependencies.yaml. ([#13523](https://github.com/rapidsai/cudf/pull/13523)) [@bdice](https://github.com/bdice) +- Fix some memoization logic in groupby/sort/sort_helper.cu ([#13521](https://github.com/rapidsai/cudf/pull/13521)) [@davidwendt](https://github.com/davidwendt) +- Use sizes_to_offsets_iterator in cudf::gather for strings ([#13520](https://github.com/rapidsai/cudf/pull/13520)) [@davidwendt](https://github.com/davidwendt) +- use rapids-upload-docs script ([#13518](https://github.com/rapidsai/cudf/pull/13518)) [@AyodeAwe](https://github.com/AyodeAwe) +- Support UTF-8 BOM in CSV reader ([#13516](https://github.com/rapidsai/cudf/pull/13516)) [@davidwendt](https://github.com/davidwendt) +- Move stream-related test configuration to CMake ([#13513](https://github.com/rapidsai/cudf/pull/13513)) [@vyasr](https://github.com/vyasr) +- Implement `cudf.option_context` ([#13511](https://github.com/rapidsai/cudf/pull/13511)) [@galipremsagar](https://github.com/galipremsagar) +- Unpin `dask` and `distributed` for development ([#13508](https://github.com/rapidsai/cudf/pull/13508)) [@galipremsagar](https://github.com/galipremsagar) +- Change build.sh to use pip install instead of setup.py ([#13507](https://github.com/rapidsai/cudf/pull/13507)) [@vyasr](https://github.com/vyasr) +- Use test default stream ([#13506](https://github.com/rapidsai/cudf/pull/13506)) [@vyasr](https://github.com/vyasr) +- Remove documentation build scripts for Jenkins ([#13495](https://github.com/rapidsai/cudf/pull/13495)) [@ajschmidt8](https://github.com/ajschmidt8) +- Use east const in include files ([#13494](https://github.com/rapidsai/cudf/pull/13494)) [@karthikeyann](https://github.com/karthikeyann) +- Use east const in src files ([#13493](https://github.com/rapidsai/cudf/pull/13493)) [@karthikeyann](https://github.com/karthikeyann) +- Use east const in tests files ([#13492](https://github.com/rapidsai/cudf/pull/13492)) [@karthikeyann](https://github.com/karthikeyann) +- Use east const in benchmarks files ([#13491](https://github.com/rapidsai/cudf/pull/13491)) [@karthikeyann](https://github.com/karthikeyann) +- Performance improvement for nvtext tokenize/token functions ([#13480](https://github.com/rapidsai/cudf/pull/13480)) [@davidwendt](https://github.com/davidwendt) +- Add pd.Float*Dtype to Avro and ORC mappings ([#13475](https://github.com/rapidsai/cudf/pull/13475)) [@mroeschke](https://github.com/mroeschke) +- Use pandas public APIs where available ([#13467](https://github.com/rapidsai/cudf/pull/13467)) [@mroeschke](https://github.com/mroeschke) +- Allow pd.ArrowDtype in cudf.from_pandas ([#13465](https://github.com/rapidsai/cudf/pull/13465)) [@mroeschke](https://github.com/mroeschke) +- Rework libcudf regex benchmarks with nvbench ([#13464](https://github.com/rapidsai/cudf/pull/13464)) [@davidwendt](https://github.com/davidwendt) +- Remove unused max_rows_tensor parameter from subword tokenizer ([#13463](https://github.com/rapidsai/cudf/pull/13463)) [@davidwendt](https://github.com/davidwendt) +- Separate io-text and nvtext pytests into different files ([#13435](https://github.com/rapidsai/cudf/pull/13435)) [@davidwendt](https://github.com/davidwendt) +- Add a move_to function to cudf::string_view::const_iterator ([#13428](https://github.com/rapidsai/cudf/pull/13428)) [@davidwendt](https://github.com/davidwendt) +- Allow newer scikit-build ([#13424](https://github.com/rapidsai/cudf/pull/13424)) 
[@vyasr](https://github.com/vyasr) +- Refactor sort_by_values to sort_values, drop indices from return values. ([#13419](https://github.com/rapidsai/cudf/pull/13419)) [@bdice](https://github.com/bdice) +- Inline Cython exception handler ([#13411](https://github.com/rapidsai/cudf/pull/13411)) [@vyasr](https://github.com/vyasr) +- Init JNI version 23.08.0-SNAPSHOT ([#13401](https://github.com/rapidsai/cudf/pull/13401)) [@pxLi](https://github.com/pxLi) +- Refactor ORC reader ([#13396](https://github.com/rapidsai/cudf/pull/13396)) [@ttnghia](https://github.com/ttnghia) +- JNI: Remove cleaned objects in memory cleaner ([#13378](https://github.com/rapidsai/cudf/pull/13378)) [@res-life](https://github.com/res-life) +- Add tests of currently unsupported indexing ([#13338](https://github.com/rapidsai/cudf/pull/13338)) [@wence-](https://github.com/wence-) +- Performance improvement for some libcudf regex functions for long strings ([#13322](https://github.com/rapidsai/cudf/pull/13322)) [@davidwendt](https://github.com/davidwendt) +- Exposure Tracked Buffer (first step towards unifying copy-on-write and spilling) ([#13307](https://github.com/rapidsai/cudf/pull/13307)) [@madsbk](https://github.com/madsbk) +- Write string data directly to column_buffer in Parquet reader ([#13302](https://github.com/rapidsai/cudf/pull/13302)) [@etseidl](https://github.com/etseidl) +- Add stacktrace into cudf exception types ([#13298](https://github.com/rapidsai/cudf/pull/13298)) [@ttnghia](https://github.com/ttnghia) +- cuDF: Build CUDA 12 packages ([#12922](https://github.com/rapidsai/cudf/pull/12922)) [@bdice](https://github.com/bdice) + +# cuDF 23.06.00 (7 Jun 2023) + +## 🚨 Breaking Changes + +- Fix batch processing for parquet writer ([#13438](https://github.com/rapidsai/cudf/pull/13438)) [@ttnghia](https://github.com/ttnghia) +- Use <NA> instead of null to match pandas. 
([#13415](https://github.com/rapidsai/cudf/pull/13415)) [@bdice](https://github.com/bdice) +- Remove UNKNOWN_NULL_COUNT ([#13372](https://github.com/rapidsai/cudf/pull/13372)) [@vyasr](https://github.com/vyasr) +- Remove default UNKNOWN_NULL_COUNT from cudf::column member functions ([#13341](https://github.com/rapidsai/cudf/pull/13341)) [@davidwendt](https://github.com/davidwendt) +- Use std::overflow_error when output would exceed column size limit ([#13323](https://github.com/rapidsai/cudf/pull/13323)) [@davidwendt](https://github.com/davidwendt) +- Remove null mask and null count from column_view constructors ([#13311](https://github.com/rapidsai/cudf/pull/13311)) [@vyasr](https://github.com/vyasr) +- Change default value of the `observed=` argument in groupby to `True` to reflect the actual behaviour ([#13296](https://github.com/rapidsai/cudf/pull/13296)) [@shwina](https://github.com/shwina) +- Throw error if UNINITIALIZED is passed to cudf::state_null_count ([#13292](https://github.com/rapidsai/cudf/pull/13292)) [@davidwendt](https://github.com/davidwendt) +- Remove default null-count parameter from cudf::make_strings_column factory ([#13227](https://github.com/rapidsai/cudf/pull/13227)) [@davidwendt](https://github.com/davidwendt) +- Remove UNKNOWN_NULL_COUNT where it can be easily computed ([#13205](https://github.com/rapidsai/cudf/pull/13205)) [@vyasr](https://github.com/vyasr) +- Update minimum Python version to Python 3.9 ([#13196](https://github.com/rapidsai/cudf/pull/13196)) [@shwina](https://github.com/shwina) +- Refactor contiguous_split API into contiguous_split.hpp ([#13186](https://github.com/rapidsai/cudf/pull/13186)) [@abellina](https://github.com/abellina) +- Cleanup Parquet chunked writer ([#13094](https://github.com/rapidsai/cudf/pull/13094)) [@ttnghia](https://github.com/ttnghia) +- Cleanup ORC chunked writer ([#13091](https://github.com/rapidsai/cudf/pull/13091)) [@ttnghia](https://github.com/ttnghia) +- Raise `NotImplementedError` when attempting to construct cuDF objects from timezone-aware datetimes ([#13086](https://github.com/rapidsai/cudf/pull/13086)) [@shwina](https://github.com/shwina) +- Remove deprecated regex functions from libcudf ([#13067](https://github.com/rapidsai/cudf/pull/13067)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Upgrade to `arrow-11` ([#12757](https://github.com/rapidsai/cudf/pull/12757)) [@galipremsagar](https://github.com/galipremsagar) +- Implement Python drop_duplicates with cudf::stable_distinct. ([#11656](https://github.com/rapidsai/cudf/pull/11656)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🐛 Bug Fixes + +- Fix valid count computation in offset_bitmask_binop kernel ([#13489](https://github.com/rapidsai/cudf/pull/13489)) [@davidwendt](https://github.com/davidwendt) +- Fix writing of ORC files with empty rowgroups ([#13466](https://github.com/rapidsai/cudf/pull/13466)) [@vuule](https://github.com/vuule) +- Fix cudf::repeat logic when count is zero ([#13459](https://github.com/rapidsai/cudf/pull/13459)) [@davidwendt](https://github.com/davidwendt) +- Fix batch processing for parquet writer ([#13438](https://github.com/rapidsai/cudf/pull/13438)) [@ttnghia](https://github.com/ttnghia) +- Fix invalid use of std::exclusive_scan in Parquet writer ([#13434](https://github.com/rapidsai/cudf/pull/13434)) [@etseidl](https://github.com/etseidl) +- Patch numba if it is imported first to ensure minor version compatibility works. 
([#13433](https://github.com/rapidsai/cudf/pull/13433)) [@bdice](https://github.com/bdice) +- Fix cudf::strings::replace_with_backrefs hang on empty match result ([#13418](https://github.com/rapidsai/cudf/pull/13418)) [@davidwendt](https://github.com/davidwendt) +- Use <NA> instead of null to match pandas. ([#13415](https://github.com/rapidsai/cudf/pull/13415)) [@bdice](https://github.com/bdice) +- Fix tokenize with non-space delimiter ([#13403](https://github.com/rapidsai/cudf/pull/13403)) [@shwina](https://github.com/shwina) +- Fix groupby head/tail for empty dataframe ([#13398](https://github.com/rapidsai/cudf/pull/13398)) [@shwina](https://github.com/shwina) +- Default to closed="right" in `IntervalIndex` constructor ([#13394](https://github.com/rapidsai/cudf/pull/13394)) [@shwina](https://github.com/shwina) +- Correctly reorder and reindex scan groupbys with null keys ([#13389](https://github.com/rapidsai/cudf/pull/13389)) [@wence-](https://github.com/wence-) +- Fix unused argument errors in nvcc 11.5 ([#13387](https://github.com/rapidsai/cudf/pull/13387)) [@abellina](https://github.com/abellina) +- Updates needed to work with jitify that leverages libcudacxx ([#13383](https://github.com/rapidsai/cudf/pull/13383)) [@robertmaynard](https://github.com/robertmaynard) +- Fix unused parameter warning/error in parquet/page_data.cu ([#13367](https://github.com/rapidsai/cudf/pull/13367)) [@davidwendt](https://github.com/davidwendt) +- Fix page size estimation in Parquet writer ([#13364](https://github.com/rapidsai/cudf/pull/13364)) [@etseidl](https://github.com/etseidl) +- Fix subword_tokenize error when input contains no tokens ([#13320](https://github.com/rapidsai/cudf/pull/13320)) [@davidwendt](https://github.com/davidwendt) +- Support gcc 12 as the C++ compiler ([#13316](https://github.com/rapidsai/cudf/pull/13316)) [@robertmaynard](https://github.com/robertmaynard) +- Correctly set bitmask size in `from_column_view` ([#13315](https://github.com/rapidsai/cudf/pull/13315)) [@wence-](https://github.com/wence-) +- Fix approach to detecting assignment for gte/lte operators ([#13285](https://github.com/rapidsai/cudf/pull/13285)) [@vyasr](https://github.com/vyasr) +- Fix parquet schema interpretation issue ([#13277](https://github.com/rapidsai/cudf/pull/13277)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix 64bit shift bug in avro reader ([#13276](https://github.com/rapidsai/cudf/pull/13276)) [@karthikeyann](https://github.com/karthikeyann) +- Fix unused variables/parameters in parquet/writer_impl.cu ([#13263](https://github.com/rapidsai/cudf/pull/13263)) [@davidwendt](https://github.com/davidwendt) +- Clean up buffers in case AssertionError ([#13262](https://github.com/rapidsai/cudf/pull/13262)) [@razajafri](https://github.com/razajafri) +- Allow empty input table in ast `compute_column` ([#13245](https://github.com/rapidsai/cudf/pull/13245)) [@wence-](https://github.com/wence-) +- Fix structs_column_wrapper constructors to copy input column wrappers ([#13243](https://github.com/rapidsai/cudf/pull/13243)) [@davidwendt](https://github.com/davidwendt) +- Fix the row index stream order in ORC reader ([#13242](https://github.com/rapidsai/cudf/pull/13242)) [@vuule](https://github.com/vuule) +- Make `is_decompression_disabled` and `is_compression_disabled` thread-safe ([#13240](https://github.com/rapidsai/cudf/pull/13240)) [@vuule](https://github.com/vuule) +- Add [[maybe_unused]] to nvbench environment. 
([#13219](https://github.com/rapidsai/cudf/pull/13219)) [@bdice](https://github.com/bdice) +- Fix race in ORC string dictionary creation ([#13214](https://github.com/rapidsai/cudf/pull/13214)) [@revans2](https://github.com/revans2) +- Add scalar argtypes to udf cache keys ([#13194](https://github.com/rapidsai/cudf/pull/13194)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix unused parameter warning/error in grouped_rolling.cu ([#13192](https://github.com/rapidsai/cudf/pull/13192)) [@davidwendt](https://github.com/davidwendt) +- Avoid skbuild 0.17.2 which affected the cmake -DPython_LIBRARY string ([#13188](https://github.com/rapidsai/cudf/pull/13188)) [@sevagh](https://github.com/sevagh) +- Fix `hostdevice_vector::subspan` ([#13187](https://github.com/rapidsai/cudf/pull/13187)) [@ttnghia](https://github.com/ttnghia) +- Use custom nvbench entry point to ensure `cudf::nvbench_base_fixture` usage ([#13183](https://github.com/rapidsai/cudf/pull/13183)) [@robertmaynard](https://github.com/robertmaynard) +- Fix slice_strings to return empty strings for stop < start indices ([#13178](https://github.com/rapidsai/cudf/pull/13178)) [@davidwendt](https://github.com/davidwendt) +- Allow compilation with any GTest version 1.11+ ([#13153](https://github.com/rapidsai/cudf/pull/13153)) [@robertmaynard](https://github.com/robertmaynard) +- Fix a few clang-format style check errors ([#13146](https://github.com/rapidsai/cudf/pull/13146)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Fix `Series` and `DataFrame` constructors to validate index lengths ([#13122](https://github.com/rapidsai/cudf/pull/13122)) [@galipremsagar](https://github.com/galipremsagar) +- Fix hash join when the input tables have nulls on only one side ([#13120](https://github.com/rapidsai/cudf/pull/13120)) [@ttnghia](https://github.com/ttnghia) +- Fix GPU_ARCHS setting in Java CMake build and CMAKE_CUDA_ARCHITECTURES in Python package build. 
([#13117](https://github.com/rapidsai/cudf/pull/13117)) [@davidwendt](https://github.com/davidwendt) +- Adds checks to make sure json reader won't overflow ([#13115](https://github.com/rapidsai/cudf/pull/13115)) [@elstehle](https://github.com/elstehle) +- Fix `null_count` of columns returned by `chunked_parquet_reader` ([#13111](https://github.com/rapidsai/cudf/pull/13111)) [@vuule](https://github.com/vuule) +- Fixes sliced list and struct column bug in JSON chunked writer ([#13108](https://github.com/rapidsai/cudf/pull/13108)) [@karthikeyann](https://github.com/karthikeyann) +- [REVIEW] Fix missing confluent kafka version ([#13101](https://github.com/rapidsai/cudf/pull/13101)) [@galipremsagar](https://github.com/galipremsagar) +- Use make_empty_lists_column instead of make_empty_column(type_id::LIST) ([#13099](https://github.com/rapidsai/cudf/pull/13099)) [@davidwendt](https://github.com/davidwendt) +- Raise `NotImplementedError` when attempting to construct cuDF objects from timezone-aware datetimes ([#13086](https://github.com/rapidsai/cudf/pull/13086)) [@shwina](https://github.com/shwina) +- Fix column selection `read_parquet` benchmarks ([#13082](https://github.com/rapidsai/cudf/pull/13082)) [@vuule](https://github.com/vuule) +- Fix bugs in iterative groupby apply algorithm ([#13078](https://github.com/rapidsai/cudf/pull/13078)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add algorithm include in data_sink.hpp ([#13068](https://github.com/rapidsai/cudf/pull/13068)) [@ahendriksen](https://github.com/ahendriksen) +- Fix tests/identify_stream_usage.cpp ([#13066](https://github.com/rapidsai/cudf/pull/13066)) [@ahendriksen](https://github.com/ahendriksen) +- Prevent overflow with `skip_rows` in ORC and Parquet readers ([#13063](https://github.com/rapidsai/cudf/pull/13063)) [@vuule](https://github.com/vuule) +- Add except declaration in Cython interface for regex_program::create ([#13054](https://github.com/rapidsai/cudf/pull/13054)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Fix branch version in CI scripts ([#13029](https://github.com/rapidsai/cudf/pull/13029)) [@galipremsagar](https://github.com/galipremsagar) +- Fix OOB memory access in CSV reader when reading without NA values ([#13011](https://github.com/rapidsai/cudf/pull/13011)) [@vuule](https://github.com/vuule) +- Fix read_avro() skip_rows and num_rows. ([#12912](https://github.com/rapidsai/cudf/pull/12912)) [@tpn](https://github.com/tpn) +- Purge nonempty nulls from byte_cast list outputs. 
([#11971](https://github.com/rapidsai/cudf/pull/11971)) [@bdice](https://github.com/bdice) +- Fix consumption of CPU-backed interchange protocol dataframes ([#11392](https://github.com/rapidsai/cudf/pull/11392)) [@shwina](https://github.com/shwina) + +## 🚀 New Features + +- Remove numba JIT kernel usage from dataframe copy tests ([#13385](https://github.com/rapidsai/cudf/pull/13385)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add JNI for ORC/Parquet writer compression statistics ([#13376](https://github.com/rapidsai/cudf/pull/13376)) [@ttnghia](https://github.com/ttnghia) +- Use _compile_or_get in JIT groupby apply ([#13350](https://github.com/rapidsai/cudf/pull/13350)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- cuDF numba cuda 12 updates ([#13337](https://github.com/rapidsai/cudf/pull/13337)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add tz_convert method to convert between timestamps ([#13328](https://github.com/rapidsai/cudf/pull/13328)) [@shwina](https://github.com/shwina) +- Optionally return compression statistics from ORC and Parquet writers ([#13294](https://github.com/rapidsai/cudf/pull/13294)) [@vuule](https://github.com/vuule) +- Support the case=False argument to str.contains ([#13290](https://github.com/rapidsai/cudf/pull/13290)) [@shwina](https://github.com/shwina) +- Add an event handler for ColumnVector.close ([#13279](https://github.com/rapidsai/cudf/pull/13279)) [@abellina](https://github.com/abellina) +- JNI api for cudf::chunked_pack ([#13278](https://github.com/rapidsai/cudf/pull/13278)) [@abellina](https://github.com/abellina) +- Implement a chunked_pack API ([#13260](https://github.com/rapidsai/cudf/pull/13260)) [@abellina](https://github.com/abellina) +- Update cudf recipes to use GTest version to >=1.13 ([#13207](https://github.com/rapidsai/cudf/pull/13207)) [@robertmaynard](https://github.com/robertmaynard) +- JNI changes for range-extents in window functions. 
([#13199](https://github.com/rapidsai/cudf/pull/13199)) [@mythrocks](https://github.com/mythrocks) +- Add support for DatetimeTZDtype and tz_localize ([#13163](https://github.com/rapidsai/cudf/pull/13163)) [@shwina](https://github.com/shwina) +- Add IS_NULL operator to AST ([#13145](https://github.com/rapidsai/cudf/pull/13145)) [@karthikeyann](https://github.com/karthikeyann) +- STRING order-by column for RANGE window functions ([#13143](https://github.com/rapidsai/cudf/pull/13143)) [@mythrocks](https://github.com/mythrocks) +- Update `contains_table` to experimental row hasher and equality comparator ([#13119](https://github.com/rapidsai/cudf/pull/13119)) [@divyegala](https://github.com/divyegala) +- Automatically select `GroupBy.apply` algorithm based on if the UDF is jittable ([#13113](https://github.com/rapidsai/cudf/pull/13113)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Refactor Parquet chunked writer ([#13076](https://github.com/rapidsai/cudf/pull/13076)) [@ttnghia](https://github.com/ttnghia) +- Add Python bindings for string literal support in AST ([#13073](https://github.com/rapidsai/cudf/pull/13073)) [@karthikeyann](https://github.com/karthikeyann) +- Add Java bindings for string literal support in AST ([#13072](https://github.com/rapidsai/cudf/pull/13072)) [@karthikeyann](https://github.com/karthikeyann) +- Add string scalar support in AST ([#13061](https://github.com/rapidsai/cudf/pull/13061)) [@karthikeyann](https://github.com/karthikeyann) +- Log cuIO warnings using the libcudf logger ([#13043](https://github.com/rapidsai/cudf/pull/13043)) [@vuule](https://github.com/vuule) +- Update `mixed_join` to use experimental row hasher and comparator ([#13028](https://github.com/rapidsai/cudf/pull/13028)) [@divyegala](https://github.com/divyegala) +- Support structs of lists in row lexicographic comparator ([#13005](https://github.com/rapidsai/cudf/pull/13005)) [@ttnghia](https://github.com/ttnghia) +- Adding `hostdevice_span` that is a span createable from `hostdevice_vector` ([#12981](https://github.com/rapidsai/cudf/pull/12981)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add nvtext::minhash function ([#12961](https://github.com/rapidsai/cudf/pull/12961)) [@davidwendt](https://github.com/davidwendt) +- Support lists of structs in row lexicographic comparator ([#12953](https://github.com/rapidsai/cudf/pull/12953)) [@ttnghia](https://github.com/ttnghia) +- Update `join` to use experimental row hasher and comparator ([#12787](https://github.com/rapidsai/cudf/pull/12787)) [@divyegala](https://github.com/divyegala) +- Implement Python drop_duplicates with cudf::stable_distinct. ([#11656](https://github.com/rapidsai/cudf/pull/11656)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🛠️ Improvements + +- Drop extraneous dependencies from cudf conda recipe. ([#13406](https://github.com/rapidsai/cudf/pull/13406)) [@bdice](https://github.com/bdice) +- Handle some corner-cases in indexing with boolean masks ([#13402](https://github.com/rapidsai/cudf/pull/13402)) [@wence-](https://github.com/wence-) +- Add cudf::stable_distinct public API, tests, and benchmarks. 
([#13392](https://github.com/rapidsai/cudf/pull/13392)) [@bdice](https://github.com/bdice) +- [JNI] Pass this ColumnVector to the onClosed event handler ([#13386](https://github.com/rapidsai/cudf/pull/13386)) [@abellina](https://github.com/abellina) +- Fix JNI method with mismatched parameter list ([#13384](https://github.com/rapidsai/cudf/pull/13384)) [@ttnghia](https://github.com/ttnghia) +- Split up experimental_row_operator_tests.cu to improve its compile time ([#13382](https://github.com/rapidsai/cudf/pull/13382)) [@davidwendt](https://github.com/davidwendt) +- Deprecate cudf::strings::slice_strings APIs that accept delimiters ([#13373](https://github.com/rapidsai/cudf/pull/13373)) [@davidwendt](https://github.com/davidwendt) +- Remove UNKNOWN_NULL_COUNT ([#13372](https://github.com/rapidsai/cudf/pull/13372)) [@vyasr](https://github.com/vyasr) +- Move some nvtext benchmarks to nvbench ([#13368](https://github.com/rapidsai/cudf/pull/13368)) [@davidwendt](https://github.com/davidwendt) +- run docs nightly too ([#13366](https://github.com/rapidsai/cudf/pull/13366)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add warning for default `dtype` parameter in `get_dummies` ([#13365](https://github.com/rapidsai/cudf/pull/13365)) [@galipremsagar](https://github.com/galipremsagar) +- Add log messages about kvikIO compatibility mode ([#13363](https://github.com/rapidsai/cudf/pull/13363)) [@vuule](https://github.com/vuule) +- Switch back to using primary shared-action-workflows branch ([#13362](https://github.com/rapidsai/cudf/pull/13362)) [@vyasr](https://github.com/vyasr) +- Deprecate `StringIndex` and use `Index` instead ([#13361](https://github.com/rapidsai/cudf/pull/13361)) [@galipremsagar](https://github.com/galipremsagar) +- Ensure columns have valid null counts in CUDF JNI. 
([#13355](https://github.com/rapidsai/cudf/pull/13355)) [@mythrocks](https://github.com/mythrocks) +- Expunge most uses of `TypeVar(bound="Foo")` ([#13346](https://github.com/rapidsai/cudf/pull/13346)) [@wence-](https://github.com/wence-) +- Remove all references to UNKNOWN_NULL_COUNT in Python ([#13345](https://github.com/rapidsai/cudf/pull/13345)) [@vyasr](https://github.com/vyasr) +- Improve `distinct_count` with `cuco::static_set` ([#13343](https://github.com/rapidsai/cudf/pull/13343)) [@PointKernel](https://github.com/PointKernel) +- Fix `contiguous_split` performance ([#13342](https://github.com/rapidsai/cudf/pull/13342)) [@ttnghia](https://github.com/ttnghia) +- Remove default UNKNOWN_NULL_COUNT from cudf::column member functions ([#13341](https://github.com/rapidsai/cudf/pull/13341)) [@davidwendt](https://github.com/davidwendt) +- Update mypy to 1.3 ([#13340](https://github.com/rapidsai/cudf/pull/13340)) [@wence-](https://github.com/wence-) +- [Java] Purge non-empty nulls when setting validity ([#13335](https://github.com/rapidsai/cudf/pull/13335)) [@razajafri](https://github.com/razajafri) +- Add row-wise filtering step to `read_parquet` ([#13334](https://github.com/rapidsai/cudf/pull/13334)) [@rjzamora](https://github.com/rjzamora) +- Performance improvement for nvtext::minhash ([#13333](https://github.com/rapidsai/cudf/pull/13333)) [@davidwendt](https://github.com/davidwendt) +- Fix some libcudf functions to set the null count on returning columns ([#13331](https://github.com/rapidsai/cudf/pull/13331)) [@davidwendt](https://github.com/davidwendt) +- Change cudf::detail::concatenate_masks to return null-count ([#13330](https://github.com/rapidsai/cudf/pull/13330)) [@davidwendt](https://github.com/davidwendt) +- Move `meta` calculation in `dask_cudf.read_parquet` ([#13327](https://github.com/rapidsai/cudf/pull/13327)) [@rjzamora](https://github.com/rjzamora) +- Changes to support Numpy >= 1.24 ([#13325](https://github.com/rapidsai/cudf/pull/13325)) [@shwina](https://github.com/shwina) +- Use std::overflow_error when output would exceed column size limit ([#13323](https://github.com/rapidsai/cudf/pull/13323)) [@davidwendt](https://github.com/davidwendt) +- Clean up `distinct_count` benchmark ([#13321](https://github.com/rapidsai/cudf/pull/13321)) [@PointKernel](https://github.com/PointKernel) +- Fix gtest pinning to 1.13.0. 
([#13319](https://github.com/rapidsai/cudf/pull/13319)) [@bdice](https://github.com/bdice) +- Remove null mask and null count from column_view constructors ([#13311](https://github.com/rapidsai/cudf/pull/13311)) [@vyasr](https://github.com/vyasr) +- Address feedback from 13289 ([#13306](https://github.com/rapidsai/cudf/pull/13306)) [@vyasr](https://github.com/vyasr) +- Change default value of the `observed=` argument in groupby to `True` to reflect the actual behaviour ([#13296](https://github.com/rapidsai/cudf/pull/13296)) [@shwina](https://github.com/shwina) +- First check for `BaseDtype` when inferring the data type of an arbitrary object ([#13295](https://github.com/rapidsai/cudf/pull/13295)) [@shwina](https://github.com/shwina) +- Throw error if UNINITIALIZED is passed to cudf::state_null_count ([#13292](https://github.com/rapidsai/cudf/pull/13292)) [@davidwendt](https://github.com/davidwendt) +- Support CUDA 12.0 for pip wheels ([#13289](https://github.com/rapidsai/cudf/pull/13289)) [@divyegala](https://github.com/divyegala) +- Refactor `transform_lists_of_structs` in `row_operators.cu` ([#13288](https://github.com/rapidsai/cudf/pull/13288)) [@ttnghia](https://github.com/ttnghia) +- Branch 23.06 merge 23.04 ([#13286](https://github.com/rapidsai/cudf/pull/13286)) [@vyasr](https://github.com/vyasr) +- Update cupy dependency ([#13284](https://github.com/rapidsai/cudf/pull/13284)) [@vyasr](https://github.com/vyasr) +- Performance improvement in cudf::strings::join_strings for long strings ([#13283](https://github.com/rapidsai/cudf/pull/13283)) [@davidwendt](https://github.com/davidwendt) +- Fix unused variables and functions ([#13275](https://github.com/rapidsai/cudf/pull/13275)) [@karthikeyann](https://github.com/karthikeyann) +- Fix integer overflow in `partition` `scatter_map` construction ([#13272](https://github.com/rapidsai/cudf/pull/13272)) [@wence-](https://github.com/wence-) +- Numba 0.57 compatibility fixes ([#13271](https://github.com/rapidsai/cudf/pull/13271)) [@gmarkall](https://github.com/gmarkall) +- Performance improvement in cudf::strings::all_characters_of_type ([#13259](https://github.com/rapidsai/cudf/pull/13259)) [@davidwendt](https://github.com/davidwendt) +- Remove default null-count parameter from some libcudf factory functions ([#13258](https://github.com/rapidsai/cudf/pull/13258)) [@davidwendt](https://github.com/davidwendt) +- Roll our own generate_string() because mimesis' has gone away ([#13257](https://github.com/rapidsai/cudf/pull/13257)) [@shwina](https://github.com/shwina) +- Build wheels using new single image workflow ([#13249](https://github.com/rapidsai/cudf/pull/13249)) [@vyasr](https://github.com/vyasr) +- Enable sccache hits from local builds ([#13248](https://github.com/rapidsai/cudf/pull/13248)) [@AyodeAwe](https://github.com/AyodeAwe) +- Revert to branch-23.06 for shared-action-workflows ([#13247](https://github.com/rapidsai/cudf/pull/13247)) [@shwina](https://github.com/shwina) +- Introduce `pandas_compatible` option in `cudf` ([#13241](https://github.com/rapidsai/cudf/pull/13241)) [@galipremsagar](https://github.com/galipremsagar) +- Add metadata_builder helper class ([#13232](https://github.com/rapidsai/cudf/pull/13232)) [@abellina](https://github.com/abellina) +- Use libkvikio conda packages in libcudf, add explicit libcufile dependency. 
([#13231](https://github.com/rapidsai/cudf/pull/13231)) [@bdice](https://github.com/bdice) +- Remove default null-count parameter from cudf::make_strings_column factory ([#13227](https://github.com/rapidsai/cudf/pull/13227)) [@davidwendt](https://github.com/davidwendt) +- Performance improvement in cudf::strings::find/rfind for long strings ([#13226](https://github.com/rapidsai/cudf/pull/13226)) [@davidwendt](https://github.com/davidwendt) +- Add chunked reader benchmark ([#13223](https://github.com/rapidsai/cudf/pull/13223)) [@SrikarVanavasam](https://github.com/SrikarVanavasam) +- Set the null count in output columns in the CSV reader ([#13221](https://github.com/rapidsai/cudf/pull/13221)) [@vuule](https://github.com/vuule) +- Skip Non-Empty nulls tests for the nightly build just like we skip CuFileTest and CudaFatalTest ([#13213](https://github.com/rapidsai/cudf/pull/13213)) [@razajafri](https://github.com/razajafri) +- Fix string_scalar stream usage in write_json.cu ([#13212](https://github.com/rapidsai/cudf/pull/13212)) [@davidwendt](https://github.com/davidwendt) +- Use canonicalized name for dlopen'd libraries (libcufile) ([#13210](https://github.com/rapidsai/cudf/pull/13210)) [@shwina](https://github.com/shwina) +- Refactor pinned memory vector and ORC+Parquet writers ([#13206](https://github.com/rapidsai/cudf/pull/13206)) [@ttnghia](https://github.com/ttnghia) +- Remove UNKNOWN_NULL_COUNT where it can be easily computed ([#13205](https://github.com/rapidsai/cudf/pull/13205)) [@vyasr](https://github.com/vyasr) +- Optimization to decoding of parquet level streams ([#13203](https://github.com/rapidsai/cudf/pull/13203)) [@nvdbaranec](https://github.com/nvdbaranec) +- Clean up and simplify `gpuDecideCompression` ([#13202](https://github.com/rapidsai/cudf/pull/13202)) [@vuule](https://github.com/vuule) +- Use std::array for a statically sized vector in `create_serialized_trie` ([#13201](https://github.com/rapidsai/cudf/pull/13201)) [@vuule](https://github.com/vuule) +- Update minimum Python version to Python 3.9 ([#13196](https://github.com/rapidsai/cudf/pull/13196)) [@shwina](https://github.com/shwina) +- Refactor contiguous_split API into contiguous_split.hpp ([#13186](https://github.com/rapidsai/cudf/pull/13186)) [@abellina](https://github.com/abellina) +- Remove usage of rapids-get-rapids-version-from-git ([#13184](https://github.com/rapidsai/cudf/pull/13184)) [@jjacobelli](https://github.com/jjacobelli) +- Enable mixed-dtype decimal/scalar binary operations ([#13171](https://github.com/rapidsai/cudf/pull/13171)) [@shwina](https://github.com/shwina) +- Split up unique_count.cu to improve build time ([#13169](https://github.com/rapidsai/cudf/pull/13169)) [@davidwendt](https://github.com/davidwendt) +- Use nvtx3 includes in string examples. 
([#13165](https://github.com/rapidsai/cudf/pull/13165)) [@bdice](https://github.com/bdice) +- Change some .cu gtest files to .cpp ([#13155](https://github.com/rapidsai/cudf/pull/13155)) [@davidwendt](https://github.com/davidwendt) +- Remove wheel pytest verbosity ([#13151](https://github.com/rapidsai/cudf/pull/13151)) [@sevagh](https://github.com/sevagh) +- Fix libcudf to always pass null-count to set_null_mask ([#13149](https://github.com/rapidsai/cudf/pull/13149)) [@davidwendt](https://github.com/davidwendt) +- Fix gtests to always pass null-count to set_null_mask calls ([#13148](https://github.com/rapidsai/cudf/pull/13148)) [@davidwendt](https://github.com/davidwendt) +- Optimize JSON writer ([#13144](https://github.com/rapidsai/cudf/pull/13144)) [@karthikeyann](https://github.com/karthikeyann) +- Performance improvement for libcudf upper/lower conversion for long strings ([#13142](https://github.com/rapidsai/cudf/pull/13142)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Deprecate `pad` and `backfill` methods ([#13140](https://github.com/rapidsai/cudf/pull/13140)) [@galipremsagar](https://github.com/galipremsagar) +- Use CTAD instead of functions in ProtobufReader ([#13135](https://github.com/rapidsai/cudf/pull/13135)) [@vuule](https://github.com/vuule) +- Remove more instances of `UNKNOWN_NULL_COUNT` ([#13134](https://github.com/rapidsai/cudf/pull/13134)) [@vyasr](https://github.com/vyasr) +- Update clang-format to 16.0.1. ([#13133](https://github.com/rapidsai/cudf/pull/13133)) [@bdice](https://github.com/bdice) +- Add log messages about cuIO's nvCOMP and cuFile use ([#13132](https://github.com/rapidsai/cudf/pull/13132)) [@vuule](https://github.com/vuule) +- Branch 23.06 merge 23.04 ([#13131](https://github.com/rapidsai/cudf/pull/13131)) [@vyasr](https://github.com/vyasr) +- Compute null-count in cudf::detail::slice ([#13124](https://github.com/rapidsai/cudf/pull/13124)) [@davidwendt](https://github.com/davidwendt) +- Use ARC V2 self-hosted runners for GPU jobs ([#13123](https://github.com/rapidsai/cudf/pull/13123)) [@jjacobelli](https://github.com/jjacobelli) +- Set null-count in linked_column_view conversion operator ([#13121](https://github.com/rapidsai/cudf/pull/13121)) [@davidwendt](https://github.com/davidwendt) +- Adding ifdefs around nvcc-specific pragmas ([#13110](https://github.com/rapidsai/cudf/pull/13110)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add null-count parameter to json experimental parse_data utility ([#13107](https://github.com/rapidsai/cudf/pull/13107)) [@davidwendt](https://github.com/davidwendt) +- Remove uses-setup-env-vars ([#13105](https://github.com/rapidsai/cudf/pull/13105)) [@vyasr](https://github.com/vyasr) +- Explicitly compute null count in concatenate APIs ([#13104](https://github.com/rapidsai/cudf/pull/13104)) [@vyasr](https://github.com/vyasr) +- Replace unnecessary uses of `UNKNOWN_NULL_COUNT` ([#13102](https://github.com/rapidsai/cudf/pull/13102)) [@vyasr](https://github.com/vyasr) +- Performance improvement for cudf::string_view::find functions ([#13100](https://github.com/rapidsai/cudf/pull/13100)) [@davidwendt](https://github.com/davidwendt) +- Use `.element()` instead of `.data()` for window range calculations ([#13095](https://github.com/rapidsai/cudf/pull/13095)) [@mythrocks](https://github.com/mythrocks) +- Cleanup Parquet chunked writer ([#13094](https://github.com/rapidsai/cudf/pull/13094)) [@ttnghia](https://github.com/ttnghia) +- Fix unused variable error/warning in page_data.cu 
([#13093](https://github.com/rapidsai/cudf/pull/13093)) [@davidwendt](https://github.com/davidwendt) +- Cleanup ORC chunked writer ([#13091](https://github.com/rapidsai/cudf/pull/13091)) [@ttnghia](https://github.com/ttnghia) +- Remove using namespace cudf; from libcudf gtests source ([#13089](https://github.com/rapidsai/cudf/pull/13089)) [@davidwendt](https://github.com/davidwendt) +- Change cudf::test::make_null_mask to also return null-count ([#13081](https://github.com/rapidsai/cudf/pull/13081)) [@davidwendt](https://github.com/davidwendt) +- Resolved automerger from `branch-23.04` to `branch-23.06` ([#13080](https://github.com/rapidsai/cudf/pull/13080)) [@galipremsagar](https://github.com/galipremsagar) +- Assert for non-empty nulls ([#13071](https://github.com/rapidsai/cudf/pull/13071)) [@razajafri](https://github.com/razajafri) +- Remove deprecated regex functions from libcudf ([#13067](https://github.com/rapidsai/cudf/pull/13067)) [@davidwendt](https://github.com/davidwendt) +- Refactor `cudf::detail::sorted_order` ([#13062](https://github.com/rapidsai/cudf/pull/13062)) [@ttnghia](https://github.com/ttnghia) +- Improve performance of slice_strings for long strings ([#13057](https://github.com/rapidsai/cudf/pull/13057)) [@davidwendt](https://github.com/davidwendt) +- Reduce shared memory usage in gpuComputePageSizes by 50% ([#13047](https://github.com/rapidsai/cudf/pull/13047)) [@nvdbaranec](https://github.com/nvdbaranec) +- [REVIEW] Add notes to performance comparisons notebook ([#13044](https://github.com/rapidsai/cudf/pull/13044)) [@galipremsagar](https://github.com/galipremsagar) +- Enable binary operations between scalars and columns of differing decimal types ([#13034](https://github.com/rapidsai/cudf/pull/13034)) [@shwina](https://github.com/shwina) +- Remove console output from some libcudf gtests ([#13027](https://github.com/rapidsai/cudf/pull/13027)) [@davidwendt](https://github.com/davidwendt) +- Remove underscore in build string. ([#13025](https://github.com/rapidsai/cudf/pull/13025)) [@bdice](https://github.com/bdice) +- Bump up JNI version 23.06.0-SNAPSHOT ([#13021](https://github.com/rapidsai/cudf/pull/13021)) [@pxLi](https://github.com/pxLi) +- Fix auto merger from `branch-23.04` to `branch-23.06` ([#13009](https://github.com/rapidsai/cudf/pull/13009)) [@galipremsagar](https://github.com/galipremsagar) +- Reduce peak memory use when writing compressed ORC files. 
([#12963](https://github.com/rapidsai/cudf/pull/12963)) [@vuule](https://github.com/vuule) +- Add nvtx annotatations to groupby methods ([#12941](https://github.com/rapidsai/cudf/pull/12941)) [@wence-](https://github.com/wence-) +- Compute column sizes in Parquet preprocess with single kernel ([#12931](https://github.com/rapidsai/cudf/pull/12931)) [@SrikarVanavasam](https://github.com/SrikarVanavasam) +- Add Python bindings for time zone data (TZiF) reader ([#12826](https://github.com/rapidsai/cudf/pull/12826)) [@shwina](https://github.com/shwina) +- Optimize set-like operations ([#12769](https://github.com/rapidsai/cudf/pull/12769)) [@ttnghia](https://github.com/ttnghia) +- [REVIEW] Upgrade to `arrow-11` ([#12757](https://github.com/rapidsai/cudf/pull/12757)) [@galipremsagar](https://github.com/galipremsagar) +- Add empty test files for test reorganization ([#12288](https://github.com/rapidsai/cudf/pull/12288)) [@shwina](https://github.com/shwina) + # cuDF 23.04.00 (6 Apr 2023) ## 🚨 Breaking Changes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f5959de10ab..07537e75018 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -73,7 +73,7 @@ Compilers: * `gcc` version 9.3+ * `nvcc` version 11.5+ -* `cmake` version 3.23.1+ +* `cmake` version 3.26.4+ CUDA/GPU: diff --git a/README.md b/README.md index e62d6772755..64c980d0cb3 100644 --- a/README.md +++ b/README.md @@ -61,11 +61,11 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### Conda -cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: +cuDF can be installed with conda (via [miniconda](https://conda.io/miniconda.html) or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=23.06 python=3.10 cudatoolkit=11.8 + cudf=23.10 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/build.sh b/build.sh index 3d004f4fd4d..2ad69712e5d 100755 --- a/build.sh +++ b/build.sh @@ -32,7 +32,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li custreamz - build the custreamz Python package -v - verbose build mode -g - build for debug - -n - no install step + -n - no install step (does not affect Python) --allgpuarch - build for all supported GPU architectures --disable_nvtx - disable inserting NVTX profiling ranges --opensource_nvcomp - disable use of proprietary nvcomp extensions @@ -332,10 +332,9 @@ fi if buildAll || hasArg cudf; then cd ${REPODIR}/python/cudf - python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - fi + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} ${EXTRA_CMAKE_ARGS}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ + python -m pip install --no-build-isolation --no-deps . 
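+    # For example, `./build.sh cudf` now amounts to the following standalone
+    # invocation from python/cudf (values shown here are illustrative, not
+    # taken from this script):
+    #   SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=$HOME/local" \
+    #   SKBUILD_BUILD_OPTIONS="-j8" \
+    #   python -m pip install --no-build-isolation --no-deps .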
fi @@ -343,12 +342,7 @@ fi if buildAll || hasArg dask_cudf; then cd ${REPODIR}/python/dask_cudf - if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt - else - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - fi + python -m pip install --no-build-isolation --no-deps . fi if hasArg cudfjar; then @@ -375,21 +369,15 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt - else - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} --library-dir=${LIBCUDF_BUILD_DIR} - fi + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ + python -m pip install --no-build-isolation --no-deps . fi # build custreamz Python package if hasArg custreamz; then cd ${REPODIR}/python/custreamz - if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt - else - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} --library-dir=${LIBCUDF_BUILD_DIR} - fi + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ + python -m pip install --no-build-isolation --no-deps . fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index bc27e7d76b0..8b757fecf5a 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,6 +11,8 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild conda/recipes/libcudf +# With boa installed conda build forward to boa +rapids-conda-retry mambabuild \ + conda/recipes/libcudf rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index bfb782ef467..1ed047a500b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -19,39 +19,39 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) -VERSION_NUMBER="23.06" rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf +export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_DOCS_DIR="$(mktemp -d)" -rapids-logger "Build Doxygen docs" +rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/${VERSION_NUMBER}/html/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/${RAPIDS_VERSION_NUMBER}/html/rmm.tag . 
|| echo "Failed to download rmm Doxygen tag" doxygen Doxyfile +mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" +mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" popd -rapids-logger "Build cuDF Sphinx docs" +rapids-logger "Build Python docs" pushd docs/cudf -sphinx-build -b dirhtml source _html -sphinx-build -b text source _text +make dirhtml +make text +mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt} +mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" +mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" popd - rapids-logger "Build dask-cuDF Sphinx docs" pushd docs/dask_cudf -sphinx-build -b dirhtml source _html -sphinx-build -b text source _text +make dirhtml +make text +mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/"{html,txt} +mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" +mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" popd - -if [[ ${RAPIDS_BUILD_TYPE} == "branch" ]]; then - rapids-logger "Upload Docs to S3" - aws s3 sync --no-progress --delete cpp/doxygen/html "s3://rapidsai-docs/libcudf/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/cudf/_html "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/cudf/_text "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/txt" - aws s3 sync --no-progress --delete docs/dask_cudf/_html "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/dask_cudf/_text "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/txt" -fi +rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index ec34d63b282..61f160b25f5 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -15,24 +15,25 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly -rapids-mamba-retry mambabuild \ +# With boa installed conda build forwards to the boa builder +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 00000000000..06d0c3c7a56 --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_name=$1 +package_dir=$2 + +source rapids-configure-sccache +source rapids-date-string + +# Use gha-tools rapids-pip-wheel-version to generate wheel version then +# update the necessary files +version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. 
+pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +if [[ ${package_name} == "dask_cudf" ]]; then + sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} +else + sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} + # ptxcompiler and cubinlinker aren't version constrained + sed -r -i "s/ptxcompiler\"/ptxcompiler${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + sed -r -i "s/cubinlinker\"/cubinlinker${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +fi + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} + sed -i "/ptxcompiler/d" ${pyproject_file} + sed -i "/cubinlinker/d" ${pyproject_file} +fi + +cd "${package_dir}" + +python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh new file mode 100755 index 00000000000..7d3919b2d72 --- /dev/null +++ b/ci/build_wheel_cudf.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir="python/cudf" + +export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" + +./ci/build_wheel.sh cudf ${package_dir} + +mkdir -p ${package_dir}/final_dist +python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh new file mode 100755 index 00000000000..47e35c46004 --- /dev/null +++ b/ci/build_wheel_dask_cudf.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
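+# Thin wrapper around the shared ci/build_wheel.sh above; it effectively runs
+#   ./ci/build_wheel.sh dask_cudf python/dask_cudf
+# and then uploads the wheel straight from dist/ (unlike the cudf wheel script,
+# no auditwheel repair pass is applied here).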
+ +set -euo pipefail + +package_dir="python/dask_cudf" + +./ci/build_wheel.sh dask_cudf ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/check_style.sh b/ci/check_style.sh index 36b856ae6f3..e96ad8bf1db 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index e76d9524c76..dd89b092496 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -26,7 +26,6 @@ re.compile(r"CMakeLists[.]txt$"), re.compile(r"CMakeLists_standalone[.]txt$"), re.compile(r"setup[.]cfg$"), - re.compile(r"[.]flake8[.]cython$"), re.compile(r"meta[.]yaml$"), ] ExemptFiles = [ diff --git a/ci/checks/doxygen.sh b/ci/checks/doxygen.sh index f260fbcd1a4..d932fa097e9 100755 --- a/ci/checks/doxygen.sh +++ b/ci/checks/doxygen.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. ############################### # cuDF doxygen warnings check # ############################### @@ -13,11 +13,11 @@ fi # Utility to return version as number for comparison function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } -# doxygen supported version 1.8.20 to 1.9.1 +# doxygen supported version 1.9.1 DOXYGEN_VERSION=`doxygen --version` -if [ $(version "$DOXYGEN_VERSION") -lt $(version "1.8.20") ] || [ $(version $DOXYGEN_VERSION) -gt $(version "1.9.1") ]; then +if [ ! $(version "$DOXYGEN_VERSION") -eq $(version "1.9.1") ] ; then echo -e "warning: Unsupported doxygen version $DOXYGEN_VERSION" - echo -e "Expecting doxygen version from 1.8.20 to 1.9.1" + echo -e "Expecting doxygen version 1.9.1" exit 0 fi diff --git a/ci/docs/build.sh b/ci/docs/build.sh deleted file mode 100755 index f50bb14d648..00000000000 --- a/ci/docs/build.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -################################# -# cuDF Docs build script for CI # -################################# - -if [ -z "$PROJECT_WORKSPACE" ]; then - echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment" - echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally" - exit 1 -fi - -export DOCS_WORKSPACE="$WORKSPACE/docs" -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export HOME="$WORKSPACE" -export PROJECT_WORKSPACE=/rapids/cudf -export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" -export PROJECTS=(cudf libcudf) - -gpuci_logger "Check environment..." -env - -gpuci_logger "Check GPU usage..." -nvidia-smi - -gpuci_logger "Activate conda env..." -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -gpuci_logger "Check versions..." 
-python --version - -conda info -conda config --show-sources -conda list --show-channel-urls - - -#libcudf Doxygen build -gpuci_logger "Build libcudf docs..." -cd $PROJECT_WORKSPACE/cpp/doxygen -wget "https://raw.githubusercontent.com/rapidsai/docs/gh-pages/api/librmm/${BRANCH_VERSION}/rmm.tag" || echo "Failed to download rmm Doxygen tag" -doxygen Doxyfile - -#cudf Sphinx Build -gpuci_logger "Build cuDF docs..." -cd $PROJECT_WORKSPACE/docs/cudf -make html - -#Commit to Website -cd $DOCS_WORKSPACE - -for PROJECT in ${PROJECTS[@]}; do - if [ ! -d "api/$PROJECT/$BRANCH_VERSION" ]; then - mkdir -p api/$PROJECT/$BRANCH_VERSION - fi - rm -rf $DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/* -done - - -mv $PROJECT_WORKSPACE/docs/cudf/build/html/* $DOCS_WORKSPACE/api/cudf/$BRANCH_VERSION -mv $PROJECT_WORKSPACE/cpp/doxygen/html/* $DOCS_WORKSPACE/api/libcudf/$BRANCH_VERSION diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index 3de1814dfaf..00000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -# pyproject.toml versions -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf_kafka/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/custreamz/pyproject.toml - -# cudf pyproject.toml cuda suffixes -sed -i "s/^name = \"cudf\"/name = \"cudf${CUDA_SUFFIX}\"/g" python/cudf/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/cudf/pyproject.toml -sed -i "s/ptxcompiler/ptxcompiler${CUDA_SUFFIX}/g" python/cudf/pyproject.toml -sed -i "s/cubinlinker/cubinlinker${CUDA_SUFFIX}/g" python/cudf/pyproject.toml - -# dask_cudf pyproject.toml cuda suffixes -sed -i "s/^name = \"dask_cudf\"/name = \"dask_cudf${CUDA_SUFFIX}\"/g" python/dask_cudf/pyproject.toml -# Need to provide the == to avoid modifying the URL -sed -i "s/\"cudf==/\"cudf${CUDA_SUFFIX}==/g" python/dask_cudf/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/cudf/pyproject.toml - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/{cudf,dask_cudf}/pyproject.toml - sed -i "s/numba[<=>\.,0-9]*/numba>=0.57/g" python/{cudf,dask_cudf}/pyproject.toml - sed -i "/ptxcompiler/d" python/cudf/pyproject.toml - sed -i "/cubinlinker/d" python/cudf/pyproject.toml -fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2ee901d178e..5e735a71994 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -21,12 +21,14 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} #Get . 
for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" # Need to distutils-normalize the versions for some use cases CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -60,6 +62,9 @@ sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/p sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/pyproject.toml sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/custreamz/pyproject.toml +# Wheel testing script +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh + # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake @@ -75,14 +80,24 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/ sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/dask_cudf/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/dask_cudf/source/conf.py - -# bump rmm & dask-cuda -for FILE in conda/environments/*.yaml dependencies.yaml; do - sed_runner "s/cudf==${CURRENT_SHORT_TAG_PEP440}/cudf==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/cudf_kafka==${CURRENT_SHORT_TAG_PEP440}/cudf_kafka==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/dask-cuda==${CURRENT_SHORT_TAG_PEP440}/dask-cuda==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/kvikio==${CURRENT_SHORT_TAG_PEP440}/kvikio==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/rmm==${CURRENT_SHORT_TAG_PEP440}/rmm==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; +DEPENDENCIES=( + cudf + cudf_kafka + custreamz + dask-cuda + dask-cudf + kvikio + libkvikio + librmm + rmm +) +for DEP in "${DEPENDENCIES[@]}"; do + for FILE in dependencies.yaml conda/environments/*.yaml; do + sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" ${FILE} + done + for FILE in python/*/pyproject.toml; do + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + done done # Doxyfile update @@ -96,13 +111,15 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt -# Dependency versions in pyproject.toml -sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml -sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/pyproject.toml - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" sed_runner 
"s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done -sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh +sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +# Java files +NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" +sed_runner "s|.*-SNAPSHOT|${NEXT_FULL_JAVA_TAG}|g" java/pom.xml +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" java/ci/README.md +sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 846b90c78e5..30172b76f01 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -7,35 +7,39 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -# Get library for finding incorrect default stream usage. -STREAM_IDENTIFY_LIB_MODE_CUDF="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_cudf.so" -STREAM_IDENTIFY_LIB_MODE_TESTING="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_testing.so" - -echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB_MODE_CUDF}" - # Run libcudf and libcudf_kafka gtests from libcudf-tests package -rapids-logger "Run gtests" - -cd $CONDA_PREFIX/bin/gtests/libcudf/ -export GTEST_CUDF_STREAM_MODE="new_cudf_default" export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ -export LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} - -ctest -E SPAN_TEST -j20 --output-on-failure - -# This one test is specifically designed to test using a thrust device vector, -# so we expect and allow it to include default stream usage. -_allowlist_filter="SpanTest.CanConstructFromDeviceContainers" -GTEST_FILTER="-${_allowlist_filter}" ctest -R SPAN_TEST -VV -LD_PRELOAD= GTEST_CUDF_STREAM_MODE=default GTEST_FILTER="${_allowlist_filter}" ctest -R SPAN_TEST -VV +pushd $CONDA_PREFIX/bin/gtests/libcudf/ +rapids-logger "Run libcudf gtests" +ctest -j20 --output-on-failure SUITEERROR=$? +popd if (( ${SUITEERROR} == 0 )); then - cd $CONDA_PREFIX/bin/gtests/libcudf_kafka/ + pushd $CONDA_PREFIX/bin/gtests/libcudf_kafka/ + rapids-logger "Run libcudf_kafka gtests" ctest -j20 --output-on-failure SUITEERROR=$? + popd +fi + +# Ensure that benchmarks are runnable +pushd $CONDA_PREFIX/bin/benchmarks/libcudf/ +rapids-logger "Run tests of libcudf benchmarks" + +if (( ${SUITEERROR} == 0 )); then + # Run a small Google benchmark + ./MERGE_BENCH --benchmark_filter=/2/ + SUITEERROR=$? +fi + +if (( ${SUITEERROR} == 0 )); then + # Run a small nvbench benchmark + ./STRINGS_NVBENCH --run-once --benchmark 0 --devices 0 + SUITEERROR=$? fi +popd rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh new file mode 100755 index 00000000000..83e24ab3ff1 --- /dev/null +++ b/ci/test_wheel_cudf.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +set -eou pipefail + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/cudf*.whl)[test] + +# Run smoke tests for aarch64 pull requests +if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then + python ./ci/wheel_smoke_test_cudf.py +else + python -m pytest -n 8 ./python/cudf/cudf/tests +fi diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh new file mode 100755 index 00000000000..d6e7f4bf65e --- /dev/null +++ b/ci/test_wheel_dask_cudf.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -eou pipefail + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install --no-deps ./local-cudf-dep/cudf*.whl + +# Always install latest dask for testing +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/dask_cudf*.whl)[test] + +python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4031f1aa1c3..692ba78f317 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,24 +9,27 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 - c-compiler - cachetools -- cmake>=3.23.1,!=3.25.0 +- cmake>=3.26.4 - cubinlinker -- cuda-python>=11.7.1,<12.0 +- cuda-nvtx=11.8 +- cuda-python>=11.7.1,<12.0a0 - cuda-sanitizer-api=11.8.86 -- cudatoolkit=11.8 +- cuda-version=11.8 +- cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=0.29,<0.30 -- dask-core==2023.3.2 -- dask-cuda==23.6.* -- dask==2023.3.2 -- distributed==2023.3.2.1 +- cython>=3.0.0 +- dask-core>=2023.7.1 +- dask-cuda==23.10.* +- dask>=2023.7.1 +- distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 -- doxygen=1.8.20 +- doxygen=1.9.1 - fastavro>=0.22.9 - fmt>=9.1.0,<10 - fsspec>=0.6.0 @@ -34,13 +37,17 @@ dependencies: - gmock>=1.13.0 - gtest>=1.13.0 - hypothesis +- identify>=2.5.20 - ipython -- libarrow==11.0.0.* +- libarrow==12.0.1.* +- libcufile-dev=1.4.0.31 +- libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==23.6.* +- libkvikio==23.10.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.6.* +- librmm==23.10.* +- make - mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python @@ -48,19 +55,20 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.56.4,<0.57 -- numpy>=1.21,<1.24 +- numba>=0.57 +- numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 +- nvcomp==2.6.1 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 - pandoc - pip - pre-commit -- protobuf>=4.21.6,<4.22 +- protobuf>=4.21,<5 - ptxcompiler -- pyarrow==11.0.0.* +- pyarrow==12.0.1.* - pydata-sphinx-theme - pyorc - pytest @@ -72,9 +80,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rmm==23.6.* +- rmm==23.10.* - s3fs>=2022.3.0 
-- scikit-build>=0.13.1,<0.17.2 +- scikit-build>=0.13.1 - scipy - spdlog>=1.11.0,<1.12 - sphinx @@ -86,7 +94,8 @@ dependencies: - sysroot_linux-64==2.17 - tokenizers==0.13.1 - transformers==4.24.0 -- typing_extensions +- typing_extensions>=4.0.0 +- zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml new file mode 100644 index 00000000000..cf1bf4b8733 --- /dev/null +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -0,0 +1,98 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- pytorch +- conda-forge +- nvidia +dependencies: +- aiobotocore>=2.2.0 +- benchmark==1.8.0 +- boto3>=1.21.21 +- botocore>=1.24.21 +- c-compiler +- cachetools +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvrtc-dev +- cuda-nvtx-dev +- cuda-python>=12.0,<13.0a0 +- cuda-sanitizer-api +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- dask-core>=2023.7.1 +- dask-cuda==23.10.* +- dask>=2023.7.1 +- distributed>=2023.7.1 +- dlpack>=0.5,<0.6.0a0 +- doxygen=1.9.1 +- fastavro>=0.22.9 +- fmt>=9.1.0,<10 +- fsspec>=0.6.0 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- gtest>=1.13.0 +- hypothesis +- identify>=2.5.20 +- ipython +- libarrow==12.0.1.* +- libcufile-dev +- libcurand-dev +- libkvikio==23.10.* +- librdkafka>=1.9.0,<1.10.0a0 +- librmm==23.10.* +- make +- mimesis>=4.1.0 +- moto>=4.0.8 +- msgpack-python +- myst-nb +- nbsphinx +- ninja +- notebook +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- nvcomp==2.6.1 +- nvtx>=0.2.1 +- packaging +- pandas>=1.3,<1.6.0dev0 +- pandoc +- pip +- pre-commit +- protobuf>=4.21,<5 +- pyarrow==12.0.1.* +- pydata-sphinx-theme +- pyorc +- pytest +- pytest-benchmark +- pytest-cases +- pytest-cov +- pytest-xdist +- python-confluent-kafka>=1.9.0,<1.10.0a0 +- python-snappy>=0.6.0 +- python>=3.9,<3.11 +- pytorch<1.12.0 +- rmm==23.10.* +- s3fs>=2022.3.0 +- scikit-build>=0.13.1 +- scipy +- spdlog>=1.11.0,<1.12 +- sphinx +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.17 +- tokenizers==0.13.1 +- transformers==4.24.0 +- typing_extensions>=4.0.0 +- zlib>=1.2.13 +- pip: + - git+https://github.com/python-streamz/streamz.git@master +name: all_cuda-120_arch-x86_64 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index 7494fec79a0..c98c2701653 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -8,7 +8,10 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1,!=3.25.0" + - ">=3.26.4" cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f8074711b88..a909b72c878 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -37,52 +37,74 @@ build: # libcudf's run_exports pinning is looser than we would like - libcudf ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} - - ninja + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ 
cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - protobuf >=4.21.6,<4.22 + - protobuf ==4.21.* - python - - cython >=0.29,<0.30 + - cython >=3.0.0 - scikit-build >=0.13.1 - setuptools - - numba >=0.56.4,<0.57 - dlpack >=0.5,<0.6.0a0 - - pyarrow =11 + - pyarrow =12 - libcudf ={{ version }} - rmm ={{ minor_version }} - - cudatoolkit ={{ cuda_version }} + {% if cuda_major == "11" %} + - cudatoolkit + {% else %} + - cuda-cudart-dev + - cuda-nvrtc + - libcufile-dev # [linux64] + {% endif %} + - cuda-version ={{ cuda_version }} run: - - protobuf >=4.21.6,<4.22 + - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }} - python - - typing_extensions + - typing_extensions >=4.0.0 - pandas >=1.3,<1.6.0dev0 - cupy >=12.0.0 - - numba >=0.56.4,<0.57 - - numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations + - numba >=0.57 + - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - - libcudf {{ version }} - - fastavro >=0.22.0 + - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + - ptxcompiler >=0.7.0 + - cubinlinker # CUDA enhanced compatibility. + - cuda-python >=11.7.1,<12.0a0 + {% else %} + # Needed by Numba for CUDA support + - cuda-nvcc-impl + # TODO: Add nvjitlink here + # xref: https://github.com/rapidsai/cudf/issues/12822 + - cuda-nvrtc + - cuda-python >=12.0,<13.0a0 + {% endif %} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - nvtx >=0.2.1 - packaging - - ptxcompiler >=0.7.0 - cachetools - - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0 test: requires: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - cudf diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index ccc49851a8e..b63a136ad2d 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -6,3 +6,6 @@ cxx_compiler_version: sysroot_version: - "2.17" + +cmake_version: + - ">=3.26.4" diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 9a0d0f0d48e..ec0cc402511 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -4,6 +4,7 @@ {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY @@ -35,25 +36,27 @@ build: requirements: build: - - cmake >=3.23.1,!=3.25.0 + - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - - cython >=0.29,<0.30 + - cython >=3.0.0 + - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} - setuptools run: - python + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - libcudf_kafka ={{ version }} - cudf ={{ version }} test: requires: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index b5aff0090dd..7aaa40bffd0 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -4,6 +4,7 @@ {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY @@ -38,19 +39,21 @@ requirements: - python - python-confluent-kafka >=1.9.0,<1.10.0a0 - cudf_kafka ={{ version }} + - cuda-version ={{ cuda_version }} run: - python - streamz - cudf ={{ version }} - - dask ==2023.3.2 - - dask-core ==2023.3.2 - - distributed ==2023.3.2.1 - - python-confluent-kafka >=1.9.0,<1.10.0a0 - cudf_kafka ={{ version }} + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 + - python-confluent-kafka >=1.9.0,<1.10.0a0 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: requires: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - custreamz diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index d060723859d..12809ba648f 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -38,21 +38,21 @@ requirements: host: - python - cudf ={{ version }} - - dask ==2023.3.2 - - dask-core ==2023.3.2 - - distributed ==2023.3.2.1 - - cudatoolkit ={{ cuda_version }} + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 + - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask ==2023.3.2 - - dask-core ==2023.3.2 - - distributed ==2023.3.2.1 - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: requires: 
- - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - dask_cudf diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index d315e1d8a6d..7dc54747a0c 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -18,18 +18,18 @@ if [ "${ARCH}" = "aarch64" ]; then fi # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.3.2" +export DASK_STABLE_VERSION="2023.7.1" # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed==2023.3.2.1 conda-forge::dask-core==2023.3.2 --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=="2023.3.2.1" conda-forge::dask-core=="2023.3.2" --force-reinstall + rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" + rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall fi logger "python -c 'import dask_cudf'" diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh index 7ac9e83f31c..47047f41b25 100644 --- a/conda/recipes/libcudf/build.sh +++ b/conda/recipes/libcudf/build.sh @@ -1,5 +1,9 @@ #!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. export cudf_ROOT="$(realpath ./cpp/build)" -./build.sh -n -v libcudf libcudf_kafka benchmarks tests --build_metrics --incl_cache_stats --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\" + +./build.sh -n -v \ + libcudf libcudf_kafka benchmarks tests \ + --build_metrics --incl_cache_stats \ + --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON -DNVBench_ENABLE_CUPTI=OFF\" diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index d89cbee67d0..25b3f19de77 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -5,19 +5,25 @@ cxx_compiler_version: - 11 cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc sysroot_version: - "2.17" cmake_version: - - ">=3.23.1,!=3.25.0" + - ">=3.26.4" + +gbench_version: + - "==1.8.0" gtest_version: - ">=1.13.0" libarrow_version: - - "=11" + - "=12" dlpack_version: - ">=0.5,<0.6.0a0" @@ -25,24 +31,29 @@ dlpack_version: librdkafka_version: - ">=1.9.0,<1.10.0a0" -# The CTK libraries below are missing from the conda-forge::cudatoolkit -# package. The "*_host_*" version specifiers correspond to `11.8` packages and the -# "*_run_*" version specifiers correspond to `11.x` packages. 
+fmt_version: + - ">=9.1.0,<10" + +spdlog_version: + - ">=1.11.0,<1.12" + +nvcomp_version: + - "=2.6.1" -libcufile_host_version: +zlib_version: + - ">=1.2.13" +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. + +cuda11_libcufile_host_version: - "1.4.0.31" -libcufile_run_version: +cuda11_libcufile_run_version: - ">=1.0.0.82,<=1.4.0.31" -libcurand_host_version: +cuda11_libcurand_host_version: - "=10.3.0.86" -libcurand_run_version: +cuda11_libcurand_run_version: - ">=10.2.5.43,<10.3.1" - -fmt_version: - - ">=9.1.0,<10" - -spdlog_version: - - ">=1.11.0,<1.12" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 275b8f9332f..c844131ad31 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,27 +37,43 @@ requirements: - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - cudatoolkit ={{ cuda_version }} + {% if cuda_major == "11" %} + - cudatoolkit + - libcufile {{ cuda11_libcufile_host_version }} # [linux64] + - libcufile-dev {{ cuda11_libcufile_host_version }} # [linux64] + - libcurand {{ cuda11_libcurand_host_version }} + - libcurand-dev {{ cuda11_libcurand_host_version }} - cuda-nvrtc ={{ cuda_version }} - cuda-nvrtc-dev ={{ cuda_version }} - cuda-nvtx ={{ cuda_version }} - - libcufile {{ libcufile_host_version }} # [linux64] - - libcufile-dev {{ libcufile_host_version }} # [linux64] - - libcurand {{ libcurand_host_version }} - - libcurand-dev {{ libcurand_host_version }} + {% else %} + - cuda-nvrtc-dev + - cuda-nvtx-dev + - libcufile-dev # [linux64] + - libcurand-dev + {% endif %} + - cuda-version ={{ cuda_version }} + - nvcomp {{ nvcomp_version }} - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - spdlog {{ spdlog_version }} + - benchmark {{ gbench_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} + - zlib {{ zlib_version }} outputs: - name: libcudf @@ -69,17 +85,25 @@ outputs: run_exports: - {{ pin_subpackage("libcudf", max_pin="x.x") }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} run: - - cudatoolkit {{ cuda_spec }} + {% if cuda_major == "11" %} + - cudatoolkit + - libcufile {{ cuda11_libcufile_run_version }} # [linux64] + {% else %} + - cuda-nvrtc + - libcufile # [linux64] + {% endif %} + - cuda-version {{ cuda_spec }} + - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - libarrow {{ libarrow_version }} - - libcufile {{ libcufile_run_version }} # [linux64] - - libcufile-dev {{ libcufile_run_version }} # [linux64] - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} @@ -91,6 +115,7 @@ outputs: - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_testing.so - test -f $PREFIX/include/cudf/aggregation.hpp - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp + - test -f 
$PREFIX/include/cudf/ast/detail/expression_transformer.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - test -f $PREFIX/include/cudf/ast/expressions.hpp - test -f $PREFIX/include/cudf/binaryop.hpp @@ -107,6 +132,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/binaryop.hpp - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp + - test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp - test -f $PREFIX/include/cudf/detail/contiguous_split.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp - test -f $PREFIX/include/cudf/detail/datetime.hpp @@ -115,7 +141,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/groupby.hpp - test -f $PREFIX/include/cudf/detail/groupby/group_replace_nulls.hpp - test -f $PREFIX/include/cudf/detail/groupby/sort_helper.hpp - - test -f $PREFIX/include/cudf/detail/hashing.hpp - test -f $PREFIX/include/cudf/detail/interop.hpp - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp - test -f $PREFIX/include/cudf/detail/join.hpp @@ -149,6 +174,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp - test -f $PREFIX/include/cudf/detail/utilities/logger.hpp - test -f $PREFIX/include/cudf/detail/utilities/pinned_host_vector.hpp + - test -f $PREFIX/include/cudf/detail/utilities/stacktrace.hpp - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp @@ -167,7 +193,9 @@ outputs: - test -f $PREFIX/include/cudf/fixed_point/temporary.hpp - test -f $PREFIX/include/cudf/groupby.hpp - test -f $PREFIX/include/cudf/hashing.hpp + - test -f $PREFIX/include/cudf/hashing/detail/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp + - test -f $PREFIX/include/cudf/io/arrow_io_source.hpp - test -f $PREFIX/include/cudf/io/avro.hpp - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/data_sink.hpp @@ -184,6 +212,7 @@ outputs: - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - test -f $PREFIX/include/cudf/io/orc_types.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp + - test -f $PREFIX/include/cudf/io/parquet_metadata.hpp - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp @@ -226,6 +255,7 @@ outputs: - test -f $PREFIX/include/cudf/partitioning.hpp - test -f $PREFIX/include/cudf/quantiles.hpp - test -f $PREFIX/include/cudf/reduction.hpp + - test -f $PREFIX/include/cudf/reduction/detail/reduction.hpp - test -f $PREFIX/include/cudf/reduction/detail/reduction_functions.hpp - test -f $PREFIX/include/cudf/reduction/detail/segmented_reduction_functions.hpp - test -f $PREFIX/include/cudf/replace.hpp @@ -329,19 +359,22 @@ outputs: license_family: APACHE license_file: LICENSE summary: libcudf library - prelink_message: - - nvcomp.txt - name: libcudf_kafka version: {{ version }} script: install_libcudf_kafka.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + host: + - librdkafka {{ librdkafka_version }} + - {{ pin_subpackage('libcudf', 
exact=True) }} run: - librdkafka {{ librdkafka_version }} - {{ pin_subpackage('libcudf', exact=True) }} @@ -359,20 +392,32 @@ outputs: script: install_libcudf_example.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libcudf', exact=True) }} + {% if cuda_major == "11" %} - cuda-nvtx ={{ cuda_version }} + {% else %} + - cuda-nvtx-dev + {% endif %} + - cuda-version ={{ cuda_version }} run: - {{ pin_subpackage('libcudf', exact=True) }} about: @@ -388,17 +433,34 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + host: + - {{ pin_subpackage('libcudf', exact=True) }} + - {{ pin_subpackage('libcudf_kafka', exact=True) }} + - cuda-version {{ cuda_spec }} + {% if cuda_major == "11" %} + - libcurand {{ cuda11_libcurand_run_version }} + {% else %} + - libcurand-dev + {% endif %} + - benchmark {{ gbench_version }} + - gtest {{ gtest_version }} + - gmock {{ gtest_version }} run: - {{ pin_subpackage('libcudf', exact=True) }} - {{ pin_subpackage('libcudf_kafka', exact=True) }} - - cudatoolkit {{ cuda_spec }} + - cuda-version {{ cuda_spec }} + {% if cuda_major == "11" %} + - libcurand {{ cuda11_libcurand_run_version }} + {% endif %} + - benchmark {{ gbench_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} - - libcurand {{ libcurand_run_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/conda/recipes/libcudf/nvcomp.txt b/conda/recipes/libcudf/nvcomp.txt deleted file mode 100644 index 9a0047e71fa..00000000000 --- a/conda/recipes/libcudf/nvcomp.txt +++ /dev/null @@ -1,3 +0,0 @@ -By downloading and using the libcudf conda package, you accept the terms -and conditions of the NVIDIA NVCOMP Software License Agreement: - https://developer.download.nvidia.com/compute/nvcomp/2.3/LICENSE.txt diff --git a/conda/recipes/libcudf/post-link.sh b/conda/recipes/libcudf/post-link.sh deleted file mode 100644 index 8ae2349f791..00000000000 --- a/conda/recipes/libcudf/post-link.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Only add the license notice to libcudf and not our examples / tests -if [[ "$PKG_NAME" == "libcudf" ]]; then - cat ./nvcomp.txt >> $PREFIX/.messages.txt -fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0c33550c9df..516865e5782 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../fetch_rapids.cmake) include(rapids-cmake) @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 23.06.00 + VERSION 23.10.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) @@ -62,11 +62,18 @@ option( stream to external libraries." OFF ) +# Option to add all symbols to the dynamic symbol table in the library file, allowing to retrieve +# human-readable stacktrace for debugging. +option( + CUDF_BUILD_STACKTRACE_DEBUG + "Replace the current optimization flags by the options '-rdynamic -Og -NDEBUG', useful for debugging with stacktrace retrieval" + OFF +) option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking option(CUDA_ENABLE_LINEINFO - "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF + "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking @@ -94,13 +101,17 @@ message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_C message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}") message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}") +message( + VERBOSE + "CUDF: Replace the current optimization flags by the options '-rdynamic -Og' (useful for debugging with stacktrace retrieval): ${CUDF_BUILD_STACKTRACE_DEBUG}" +) message( VERBOSE "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNINGS}" ) message( VERBOSE - "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler: ${CUDA_ENABLE_LINEINFO}" + "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") @@ -115,6 +126,10 @@ if(BUILD_TESTS AND NOT CUDF_BUILD_TESTUTIL) ) endif() +if(CUDF_BUILD_STACKTRACE_DEBUG AND NOT CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR "CUDF_BUILD_STACKTRACE_DEBUG is only supported with GCC compiler") +endif() + set(CUDF_CXX_FLAGS "") set(CUDF_CUDA_FLAGS "") set(CUDF_CXX_DEFINITIONS "") @@ -178,8 +193,7 @@ include(cmake/thirdparty/get_arrow.cmake) # find dlpack include(cmake/thirdparty/get_dlpack.cmake) # find libcu++ -include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) -rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) +include(cmake/thirdparty/get_libcudacxx.cmake) # find cuCollections Should come after including thrust and libcudacxx include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest @@ -330,8 +344,10 @@ add_library( src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu - src/hash/murmur_hash.cu - src/hash/spark_murmur_hash.cu + src/hash/murmurhash3_x86_32.cu + src/hash/murmurhash3_x64_128.cu + src/hash/spark_murmurhash3_x86_32.cu + src/hash/xxhash_64.cu 
src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu @@ -346,6 +362,7 @@ add_library( src/io/comp/nvcomp_adapter.cpp src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu + src/io/comp/statistics.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu src/io/csv/csv_gpu.cu @@ -353,13 +370,13 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_gpu.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu - src/io/json/reader_impl.cu - src/io/json/experimental/byte_range_info.cu - src/io/json/experimental/read_json.cpp + src/io/json/read_json.cu + src/io/json/legacy/json_gpu.cu + src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu @@ -373,10 +390,14 @@ add_library( src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp + src/io/parquet/decode_preprocess.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_delta_decode.cu + src/io/parquet/page_string_decode.cu + src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp src/io/parquet/reader_impl_helpers.cpp @@ -389,6 +410,7 @@ add_library( src/io/text/bgzip_data_chunk_source.cu src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu + src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp @@ -483,6 +505,7 @@ add_library( src/reshape/byte_cast.cu src/reshape/interleave_columns.cu src/reshape/tile.cu + src/rolling/detail/optimized_unbounded_window.cpp src/rolling/detail/rolling_collect_list.cu src/rolling/detail/rolling_fixed_window.cu src/rolling/detail/rolling_variable_window.cu @@ -580,6 +603,7 @@ add_library( src/text/detokenize.cu src/text/edit_distance.cu src/text/generate_ngrams.cu + src/text/jaccard.cu src/text/minhash.cu src/text/ngrams_tokenize.cu src/text/normalize.cu @@ -608,6 +632,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp + src/utilities/stacktrace.cpp src/utilities/traits.cpp src/utilities/type_checks.cpp src/utilities/type_dispatcher.cpp @@ -646,6 +671,31 @@ target_compile_options( "$<$:${CUDF_CUDA_FLAGS}>" ) +if(CUDF_BUILD_STACKTRACE_DEBUG) + # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option + # 'optimize'". 
+ string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}") + string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS_MINSIZEREL + "${CMAKE_CUDA_FLAGS_MINSIZEREL}" + ) + string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS_RELWITHDEBINFO + "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO}" + ) + + add_library(cudf_backtrace INTERFACE) + target_compile_definitions(cudf_backtrace INTERFACE CUDF_BUILD_STACKTRACE_DEBUG) + target_compile_options( + cudf_backtrace INTERFACE "$<$:-Og>" + "$<$:-Xcompiler=-Og>" + ) + target_link_options( + cudf_backtrace INTERFACE "$<$:-rdynamic>" + "$<$:-Xlinker=-rdynamic>" + ) + target_link_libraries(cudf PRIVATE cudf_backtrace) +endif() + # Specify include paths for the current target and dependents target_include_directories( cudf @@ -829,7 +879,9 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) # depending via ctest and whether it has been updated to expose public stream APIs. foreach(_mode cudf testing) set(_tgt "cudf_identify_stream_usage_mode_${_mode}") - add_library(${_tgt} SHARED tests/utilities/identify_stream_usage.cpp) + add_library( + ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp + ) set_target_properties( ${_tgt} @@ -838,7 +890,14 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON ) + target_compile_options( + ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" + ) + target_include_directories(${_tgt} PRIVATE "$") target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + if(CUDF_BUILD_STACKTRACE_DEBUG) + target_link_libraries(${_tgt} PRIVATE cudf_backtrace) + endif() add_library(cudf::${_tgt} ALIAS ${_tgt}) if("${_mode}" STREQUAL "testing") diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dcc70a4b6d9..5e7862f4b3b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -153,8 +153,12 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp - stream_compaction/unique.cpp stream_compaction/unique_count.cpp + STREAM_COMPACTION_NVBENCH + stream_compaction/distinct.cpp + stream_compaction/distinct_count.cpp + stream_compaction/stable_distinct.cpp + stream_compaction/unique.cpp + stream_compaction/unique_count.cpp ) # ################################################################################################## @@ -220,7 +224,8 @@ ConfigureNVBench( # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- -ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp) +ConfigureBench(HASHING_BENCH hashing/partition.cpp) +ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- @@ -268,39 +273,48 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c # 
################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench( - TEXT_BENCH text/ngrams.cpp text/normalize.cpp text/normalize_spaces.cpp text/replace.cpp - text/subword.cpp text/tokenize.cpp -) +ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) -ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp) +ConfigureNVBench( + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp +) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( STRINGS_BENCH string/combine.cpp - string/contains.cpp string/convert_datetime.cpp string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp string/copy.cu - string/extract.cpp string/factory.cu string/filter.cpp string/find.cpp string/repeat_strings.cpp string/replace.cpp - string/replace_re.cpp string/slice.cpp - string/split.cpp string/translate.cpp string/url_decode.cu ) ConfigureNVBench( - STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp + STRINGS_NVBENCH + string/case.cpp + string/char_types.cpp + string/contains.cpp + string/count.cpp + string/extract.cpp + string/gather.cpp + string/join_strings.cpp + string/lengths.cpp + string/like.cpp + string/replace_re.cpp + string/reverse.cpp + string/split.cpp + string/split_re.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index fbba38431dd..a1131df4472 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -42,6 +42,10 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) cuda_event_timer timer(state, true); cudf::binary_operation(lhs, rhs, binop, output_dtype); } + + // use number of bytes read and written to global memory + state.SetBytesProcessed(static_cast(state.iterations()) * column_size * + (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); } // TODO tparam boolean for null. 
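The compiled_binaryop.cpp hunk just above adds throughput reporting: once the timed loop finishes, it hands state.SetBytesProcessed() the bytes read from both input columns plus the bytes written to the output, multiplied by the iteration count, so Google Benchmark can print a bytes-per-second figure alongside the raw time. A minimal self-contained sketch of that same reporting pattern follows; it is not part of this patch, assumes only that Google Benchmark is installed, and uses an invented BM_add_throughput case over host vectors rather than anything from libcudf.

#include <benchmark/benchmark.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative only: mirrors the bytes-processed accounting used above, but on
// the CPU so it stays self-contained. Reads two int64_t columns, writes one.
static void BM_add_throughput(benchmark::State& state)
{
  auto const n = static_cast<std::size_t>(state.range(0));
  std::vector<std::int64_t> lhs(n, 1), rhs(n, 2), out(n);
  for (auto _ : state) {
    std::transform(lhs.begin(), lhs.end(), rhs.begin(), out.begin(),
                   [](std::int64_t a, std::int64_t b) { return a + b; });
    benchmark::DoNotOptimize(out.data());
  }
  // One read each of lhs and rhs plus one write of out per iteration.
  state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations()) *
                          static_cast<std::int64_t>(n) * 3 * sizeof(std::int64_t));
}
BENCHMARK(BM_add_throughput)->Arg(1 << 20);
BENCHMARK_MAIN();

The accounting call sits after the `for (auto _ : state)` loop deliberately: state.iterations() is only final once timing has ended, and the bookkeeping itself should not be measured.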
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index fd7b469cffd..aef3d92b4f5 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -118,13 +118,27 @@ size_t non_fixed_width_size(data_profile const& profile) return get_distribution_mean(dist); } +double geometric_sum(size_t n, double p) +{ + if (p == 1) { return n; } + return (1 - std::pow(p, n)) / (1 - p); +} + template <> size_t non_fixed_width_size(data_profile const& profile) { auto const dist_params = profile.get_distribution_params(); auto const single_level_mean = get_distribution_mean(dist_params.length_params); - auto const element_size = avg_element_size(profile, cudf::data_type{dist_params.element_type}); - return element_size * pow(single_level_mean, dist_params.max_depth); + + auto const element_size = avg_element_size(profile, cudf::data_type{dist_params.element_type}); + auto const element_count = std::pow(single_level_mean, dist_params.max_depth); + + // Each nesting level includes offsets, this is the sum of all levels + // Also include an additional offset per level for the size of the last element + auto const total_offset_count = + geometric_sum(dist_params.max_depth, single_level_mean) + dist_params.max_depth; + + return sizeof(cudf::size_type) * total_offset_count + element_size * element_count; } template <> @@ -441,7 +455,8 @@ std::unique_ptr create_random_column(data_profile const& profile, dtype, num_rows, data.release(), - profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); + profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}, + profile.get_null_probability().has_value() ? null_count : 0); } struct valid_or_zero { @@ -721,8 +736,11 @@ std::unique_ptr create_random_column(data_profile thrust::device_pointer_cast(offsets.end())[-1] = current_child_column->size(); // Always include all elements - auto offsets_column = std::make_unique( - cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); + auto offsets_column = std::make_unique(cudf::data_type{cudf::type_id::INT32}, + num_rows + 1, + offsets.release(), + rmm::device_buffer{}, + 0); auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(), valids.end(), @@ -781,6 +799,25 @@ std::vector cycle_dtypes(std::vector const& dtype_ return out_dtypes; } +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. 
+ */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num) +{ + std::vector out_dtypes; + out_dtypes.reserve(num_cols); + for (cudf::size_type col = 0; col < first_num; ++col) + out_dtypes.push_back(dtype_ids.first); + for (cudf::size_type col = first_num; col < num_cols; ++col) + out_dtypes.push_back(dtype_ids.second); + return out_dtypes; +} + std::unique_ptr create_random_table(std::vector const& dtype_ids, table_size_bytes table_bytes, data_profile const& profile, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 8a5811218d0..a2efdb819bf 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -666,6 +666,21 @@ std::unique_ptr create_sequence_table( */ std::vector cycle_dtypes(std::vector const& dtype_ids, cudf::size_type num_cols); + +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. + * + * @param dtype_ids Pair of requested column types + * @param num_cols Total number of columns in the output vector + * @param first_num Total number of columns of type `dtype_ids.first` + * @return A vector of type_ids + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num); /** * @brief Create a random null mask object * diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index aff90039cb9..910fc689c0b 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -25,12 +25,30 @@ #include -template +void contiguous_split(cudf::table_view const& src_table, std::vector const& splits) +{ + auto result = cudf::contiguous_split(src_table, splits); +} + +void chunked_pack(cudf::table_view const& src_table, std::vector const&) +{ + auto const mr = rmm::mr::get_current_device_resource(); + auto const stream = cudf::get_default_stream(); + auto user_buffer = rmm::device_uvector(100L * 1024 * 1024, stream, mr); + auto chunked_pack = cudf::chunked_pack::create(src_table, user_buffer.size(), mr); + while (chunked_pack->has_next()) { + auto iter_size = chunked_pack->next(user_buffer); + } + stream.synchronize(); +} + +template void BM_contiguous_split_common(benchmark::State& state, std::vector& src_cols, int64_t num_rows, int64_t num_splits, - int64_t bytes_total) + int64_t bytes_total, + ContigSplitImpl& impl) { // generate splits std::vector splits; @@ -57,7 +75,7 @@ void BM_contiguous_split_common(benchmark::State& state, for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - auto result = cudf::contiguous_split(src_table, splits); + impl(src_table, splits); } // it's 2x bytes_total because we're both reading and writing. @@ -65,8 +83,10 @@ void BM_contiguous_split_common(benchmark::State& state, } class ContiguousSplit : public cudf::benchmark {}; +class ChunkedPack : public cudf::benchmark {}; -void BM_contiguous_split(benchmark::State& state) +template +void BM_contiguous_split(benchmark::State& state, ContiguousSplitImpl& impl) { int64_t const total_desired_bytes = state.range(0); cudf::size_type const num_cols = state.range(1); @@ -91,12 +111,14 @@ void BM_contiguous_split(benchmark::State& state) (include_validity ? 
(max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) : 0); - BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); + BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl); } class ContiguousSplitStrings : public cudf::benchmark {}; +class ChunkedPackStrings : public cudf::benchmark {}; -void BM_contiguous_split_strings(benchmark::State& state) +template +void BM_contiguous_split_strings(benchmark::State& state, ContiguousSplitImpl& impl) { int64_t const total_desired_bytes = state.range(0); cudf::size_type const num_cols = state.range(1); @@ -104,7 +126,7 @@ void BM_contiguous_split_strings(benchmark::State& state) bool const include_validity = state.range(3) != 0; constexpr int64_t string_len = 8; - std::vector h_strings{ + std::vector h_strings{ "aaaaaaaa", "bbbbbbbb", "cccccccc", "dddddddd", "eeeeeeee", "ffffffff", "gggggggg", "hhhhhhhh"}; int64_t const col_len_bytes = total_desired_bytes / num_cols; @@ -129,17 +151,17 @@ void BM_contiguous_split_strings(benchmark::State& state) } int64_t const total_bytes = - total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) + + total_desired_bytes + ((num_rows + 1) * sizeof(cudf::size_type)) + (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) : 0); - BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); + BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl); } #define CSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ BENCHMARK_DEFINE_F(ContiguousSplit, name)(::benchmark::State & state) \ { \ - BM_contiguous_split(state); \ + BM_contiguous_split(state, contiguous_split); \ } \ BENCHMARK_REGISTER_F(ContiguousSplit, name) \ ->Args({size, num_columns, num_splits, validity}) \ @@ -168,7 +190,7 @@ CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, #define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state) \ { \ - BM_contiguous_split_strings(state); \ + BM_contiguous_split_strings(state, contiguous_split); \ } \ BENCHMARK_REGISTER_F(ContiguousSplitStrings, name) \ ->Args({size, num_columns, num_splits, validity}) \ @@ -189,3 +211,53 @@ CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 10 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); + +#define CCSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ + BENCHMARK_DEFINE_F(ChunkedPack, name)(::benchmark::State & state) \ + { \ + BM_contiguous_split(state, chunked_pack); \ + } \ + BENCHMARK_REGISTER_F(ChunkedPack, name) \ + ->Args({size, num_columns, num_splits, validity}) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime() \ + ->Iterations(8) +CCSBM_BENCHMARK_DEFINE(6Gb512ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_BENCHMARK_DEFINE(6Gb512ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_BENCHMARK_DEFINE(6Gb10ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 1); + +CCSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0); 
+CCSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); + +CCSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); + +#define CCSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ + BENCHMARK_DEFINE_F(ChunkedPackStrings, name)(::benchmark::State & state) \ + { \ + BM_contiguous_split_strings(state, chunked_pack); \ + } \ + BENCHMARK_REGISTER_F(ChunkedPackStrings, name) \ + ->Args({size, num_columns, num_splits, validity}) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime() \ + ->Iterations(8) + +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); + +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index a10f54b3d6f..50ddfb82feb 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -47,6 +47,14 @@ static void BM_copy_if_else(benchmark::State& state, bool nulls) cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::copy_if_else(lhs, rhs, decision); } + + auto const bytes_read = n_rows * (sizeof(TypeParam) + sizeof(bool)); + auto const bytes_written = n_rows * sizeof(TypeParam); + auto const null_bytes = nulls ? 2 * cudf::bitmask_allocation_size_bytes(n_rows) : 0; + + // Use number of bytes read and written. 
+ state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + null_bytes)); } #define COPY_BENCHMARK_DEFINE(name, type, b) \ diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index 149fd611656..eeb0149fb3a 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -31,8 +31,8 @@ class Gather : public cudf::benchmark {}; template void BM_gather(benchmark::State& state) { - const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const auto n_cols = (cudf::size_type)state.range(1); + cudf::size_type const source_size{(cudf::size_type)state.range(0)}; + auto const n_cols = (cudf::size_type)state.range(1); // Gather indices auto gather_map_table = diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index e153abee3a3..bc6c2e52da8 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,17 +73,17 @@ class benchmark : public ::benchmark::Fixture { public: benchmark() : ::benchmark::Fixture() { - const char* env_iterations = std::getenv("CUDF_BENCHMARK_ITERATIONS"); + char const* env_iterations = std::getenv("CUDF_BENCHMARK_ITERATIONS"); if (env_iterations != nullptr) { this->Iterations(std::max(0L, atol(env_iterations))); } } - void SetUp(const ::benchmark::State& state) override + void SetUp(::benchmark::State const& state) override { mr = make_pool_instance(); rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool } - void TearDown(const ::benchmark::State& state) override + void TearDown(::benchmark::State const& state) override { // reset default resource to the initial resource rmm::mr::set_current_device_resource(nullptr); @@ -91,10 +91,10 @@ class benchmark : public ::benchmark::Fixture { } // eliminate partial override warnings (see benchmark/benchmark.h) - void SetUp(::benchmark::State& st) override { SetUp(const_cast(st)); } + void SetUp(::benchmark::State& st) override { SetUp(const_cast<::benchmark::State const&>(st)); } void TearDown(::benchmark::State& st) override { - TearDown(const_cast(st)); + TearDown(const_cast<::benchmark::State const&>(st)); } std::shared_ptr mr; diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp new file mode 100644 index 00000000000..e08f9101522 --- /dev/null +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace detail { +static std::string rmm_mode_param{"--rmm_mode"}; ///< RMM mode command-line parameter name +} // namespace detail + +/** + * Base fixture for cudf benchmarks using nvbench. + * + * Initializes the default memory resource to use the RMM pool device resource. + */ +struct nvbench_base_fixture { + inline auto make_cuda() { return std::make_shared(); } + + inline auto make_pool() + { + return rmm::mr::make_owning_wrapper(make_cuda()); + } + + inline auto make_async() { return std::make_shared(); } + + inline auto make_managed() { return std::make_shared(); } + + inline auto make_arena() + { + return rmm::mr::make_owning_wrapper(make_cuda()); + } + + inline auto make_managed_pool() + { + return rmm::mr::make_owning_wrapper(make_managed()); + } + + inline std::shared_ptr create_memory_resource( + std::string const& mode) + { + if (mode == "cuda") return make_cuda(); + if (mode == "pool") return make_pool(); + if (mode == "async") return make_async(); + if (mode == "arena") return make_arena(); + if (mode == "managed") return make_managed(); + if (mode == "managed_pool") return make_managed_pool(); + CUDF_FAIL("Unknown rmm_mode parameter: " + mode + + "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); + } + + nvbench_base_fixture(int argc, char const* const* argv) + { + for (int i = 1; i < argc - 1; ++i) { + std::string arg = argv[i]; + if (arg == detail::rmm_mode_param) { + i++; + rmm_mode = argv[i]; + } + } + + mr = create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(mr.get()); + std::cout << "RMM memory resource = " << rmm_mode << "\n"; + } + + std::shared_ptr mr; + std::string rmm_mode{"pool"}; +}; + +} // namespace cudf diff --git a/cpp/benchmarks/fixture/nvbench_main.cpp b/cpp/benchmarks/fixture/nvbench_main.cpp index f58eae62372..64c4d83ac17 100644 --- a/cpp/benchmarks/fixture/nvbench_main.cpp +++ b/cpp/benchmarks/fixture/nvbench_main.cpp @@ -14,9 +14,28 @@ * limitations under the License. */ -#include +#include #define NVBENCH_ENVIRONMENT cudf::nvbench_base_fixture #include +#include + +// strip off the rmm_mode parameter before passing the +// remaining arguments to nvbench::option_parser +#undef NVBENCH_MAIN_PARSE +#define NVBENCH_MAIN_PARSE(argc, argv) \ + nvbench::option_parser parser; \ + std::vector m_args; \ + for (int i = 0; i < argc; ++i) { \ + std::string arg = argv[i]; \ + if (arg == cudf::detail::rmm_mode_param) { \ + i += 2; \ + } else { \ + m_args.push_back(arg); \ + } \ + } \ + parser.parse(m_args) + +// this declares/defines the main() function using the definitions above NVBENCH_MAIN diff --git a/cpp/benchmarks/fixture/rmm_pool_raii.hpp b/cpp/benchmarks/fixture/rmm_pool_raii.hpp deleted file mode 100644 index 23f49735855..00000000000 --- a/cpp/benchmarks/fixture/rmm_pool_raii.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace cudf { - -/** - * @brief An RAII class setting up RMM memory pool for `nvbench` benchmarks - * - * This is a temporary solution before templated fixtures tests are supported - * in `nvbench`. Similarly to `cudf::benchmark`, creating this RAII object in - * each benchmark will ensure that the RAPIDS Memory Manager pool mode is used - * in benchmarks, which eliminates memory allocation / deallocation performance - * overhead from the benchmark. - * - * Example: - * - * void my_benchmark(nvbench::state& state) { - * cudf::rmm_pool_raii pool_raii; - * state.exec([](nvbench::launch& launch) { - * // benchmark stuff - * }); - * } - * - * NVBENCH_BENCH(my_benchmark); - */ -class rmm_pool_raii { - private: - // memory resource factory helpers - inline auto make_cuda() { return std::make_shared(); } - - inline auto make_pool() - { - return rmm::mr::make_owning_wrapper(make_cuda()); - } - - public: - rmm_pool_raii() - { - mr = make_pool(); - rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool - } - - ~rmm_pool_raii() - { - rmm::mr::set_current_device_resource(nullptr); - mr.reset(); - } - - private: - std::shared_ptr mr; -}; - -/** - * Base fixture for cudf benchmarks using nvbench. - * - * Initializes the default memory resource to use the RMM pool device resource. - */ -struct nvbench_base_fixture { - rmm_pool_raii _mr; -}; - -} // namespace cudf diff --git a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp index 7d86ed1b95c..57f52861cb5 100644 --- a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,7 +45,7 @@ namespace cudf { template class FunctionTemplateBenchmark : public Fixture { public: - FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func) + FunctionTemplateBenchmark(char const* name, ::benchmark::internal::Function* func) : Fixture(), func_(func) { this->SetName(name); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 077558f8709..e65c37f001d 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -24,7 +23,7 @@ template void bench_groupby_max(nvbench::state& state, nvbench::type_list) { - const auto size = static_cast(state.get_int64("num_rows")); + auto const size = static_cast(state.get_int64("num_rows")); auto const keys = [&] { data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( diff --git a/cpp/benchmarks/groupby/group_no_requests.cpp b/cpp/benchmarks/groupby/group_no_requests.cpp index 7a35873efe9..34618acec75 100644 --- a/cpp/benchmarks/groupby/group_no_requests.cpp +++ b/cpp/benchmarks/groupby/group_no_requests.cpp @@ -28,7 +28,7 @@ class Groupby : public cudf::benchmark {}; void BM_basic_no_requests(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); @@ -59,7 +59,7 @@ BENCHMARK_REGISTER_F(Groupby, BasicNoRequest) void BM_pre_sorted_no_requests(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/groupby/group_nth.cpp b/cpp/benchmarks/groupby/group_nth.cpp index 948414e8417..f2c24433858 100644 --- a/cpp/benchmarks/groupby/group_nth.cpp +++ b/cpp/benchmarks/groupby/group_nth.cpp @@ -29,7 +29,7 @@ class Groupby : public cudf::benchmark {}; void BM_pre_sorted_nth(benchmark::State& state) { // const cudf::size_type num_columns{(cudf::size_type)state.range(0)}; - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index f74ed95200e..63d738b2951 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -40,7 +39,7 @@ auto make_aggregation_request_vector(cudf::column_view const& values, Args&&... 
template void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) { - const auto size = static_cast(state.get_int64("num_rows")); + auto const size = static_cast(state.get_int64("num_rows")); auto const keys = [&] { data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index 6aac3826e55..2122720a421 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ #include -#include #include #include diff --git a/cpp/benchmarks/groupby/group_scan.cpp b/cpp/benchmarks/groupby/group_scan.cpp index c9ae10c775f..2ae5b6fc2b8 100644 --- a/cpp/benchmarks/groupby/group_scan.cpp +++ b/cpp/benchmarks/groupby/group_scan.cpp @@ -29,7 +29,7 @@ class Groupby : public cudf::benchmark {}; void BM_basic_sum_scan(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); @@ -61,7 +61,7 @@ BENCHMARK_REGISTER_F(Groupby, BasicSumScan) void BM_pre_sorted_sum_scan(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/groupby/group_shift.cpp b/cpp/benchmarks/groupby/group_shift.cpp index 1ad6560b73f..eda2b3dd158 100644 --- a/cpp/benchmarks/groupby/group_shift.cpp +++ b/cpp/benchmarks/groupby/group_shift.cpp @@ -28,8 +28,8 @@ class Groupby : public cudf::benchmark {}; void BM_group_shift(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - const int num_groups = 100; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + int const num_groups = 100; data_profile const profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 53ef12ffeaa..44a12c1c30e 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -34,10 +33,10 @@ void bench_groupby_struct_keys(nvbench::state& state) std::default_random_engine generator; std::uniform_int_distribution distribution(0, 100); - const cudf::size_type n_rows{static_cast(state.get_int64("NumRows"))}; - const cudf::size_type n_cols{1}; - const cudf::size_type depth{static_cast(state.get_int64("Depth"))}; - const bool nulls{static_cast(state.get_int64("Nulls"))}; + cudf::size_type const n_rows{static_cast(state.get_int64("NumRows"))}; + cudf::size_type const n_cols{1}; + cudf::size_type const depth{static_cast(state.get_int64("Depth"))}; + bool const nulls{static_cast(state.get_int64("Nulls"))}; // Create columns with values in the range [0,100) std::vector columns; diff --git a/cpp/benchmarks/groupby/group_sum.cpp b/cpp/benchmarks/groupby/group_sum.cpp index fbfb8865b81..b3fd881ccbc 100644 --- a/cpp/benchmarks/groupby/group_sum.cpp +++ b/cpp/benchmarks/groupby/group_sum.cpp @@ -28,7 +28,7 @@ class Groupby : public cudf::benchmark 
{}; void BM_basic_sum(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); @@ -61,7 +61,7 @@ BENCHMARK_REGISTER_F(Groupby, Basic) void BM_pre_sorted_sum(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index d71e4742f0a..e679b4b62d2 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -15,47 +15,71 @@ */ #include -#include -#include #include +#include #include #include -class HashBenchmark : public cudf::benchmark {}; +#include -enum contains_nulls { no_nulls, nulls }; +#include -static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls has_nulls) +static void bench_hash(nvbench::state& state) { - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const data = create_random_table({cudf::type_id::INT64}, row_count{n_rows}); - if (has_nulls == contains_nulls::no_nulls) - data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::hash(data->view(), hid); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const nulls = state.get_float64("nulls"); + // disable null bitmask if probability is exactly 0.0 + bool const no_nulls = nulls == 0.0; + auto const hash_name = state.get_string("hash_name"); + + data_profile const profile = + data_profile_builder().null_probability(no_nulls ? 
std::nullopt : std::optional{nulls}); + auto const data = create_random_table( + {cudf::type_id::INT64, cudf::type_id::STRING}, row_count{num_rows}, profile); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + // collect statistics + cudf::strings_column_view input(data->get_column(1).view()); + auto const chars_size = input.chars_size(); + // add memory read from string column + state.add_global_memory_reads(chars_size); + // add memory read from int64_t column + state.add_global_memory_reads(num_rows); + // add memory read from bitmaks + if (!no_nulls) { + state.add_global_memory_reads(2 * + cudf::bitmask_allocation_size_bytes(num_rows)); } -} + // memory written depends on used hash -#define concat(a, b, c) a##b##c + if (hash_name == "murmurhash3_x86_32") { + state.add_global_memory_writes(num_rows); -#define H_BENCHMARK_DEFINE(name, hid, n) \ - BENCHMARK_DEFINE_F(HashBenchmark, name) \ - (::benchmark::State & st) { BM_hash(st, cudf::hash_id::hid, contains_nulls::n); } \ - BENCHMARK_REGISTER_F(HashBenchmark, name) \ - ->RangeMultiplier(4) \ - ->Ranges({{1 << 14, 1 << 24}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::hashing::murmurhash3_x86_32(data->view()); + }); + } else if (hash_name == "md5") { + // md5 creates a 32-byte string + state.add_global_memory_writes(32 * num_rows); -#define HASH_BENCHMARK_DEFINE(hid, n) H_BENCHMARK_DEFINE(concat(hid, _, n), hid, n) + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); + } else if (hash_name == "spark_murmurhash3_x86_32") { + state.add_global_memory_writes(num_rows); -HASH_BENCHMARK_DEFINE(HASH_MURMUR3, nulls) -HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, nulls) -HASH_BENCHMARK_DEFINE(HASH_MD5, nulls) + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::hashing::spark_murmurhash3_x86_32(data->view()); + }); + } else { + state.skip(hash_name + ": unknown hash name"); + } +} -HASH_BENCHMARK_DEFINE(HASH_MURMUR3, no_nulls) -HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, no_nulls) -HASH_BENCHMARK_DEFINE(HASH_MD5, no_nulls) +NVBENCH_BENCH(bench_hash) + .set_name("hashing") + .add_int64_axis("num_rows", {65536, 16777216}) + .add_float64_axis("nulls", {0.0, 0.1}) + .add_string_axis("hash_name", {"murmurhash3_x86_32", "md5", "spark_murmurhash3_x86_32"}); diff --git a/cpp/benchmarks/hashing/partition.cpp b/cpp/benchmarks/hashing/partition.cpp index b688fe2ed7f..0bec4394216 100644 --- a/cpp/benchmarks/hashing/partition.cpp +++ b/cpp/benchmarks/hashing/partition.cpp @@ -43,6 +43,13 @@ void BM_hash_partition(benchmark::State& state) cuda_event_timer timer(state, true); auto output = cudf::hash_partition(input, columns_to_hash, num_partitions); } + + auto const bytes_read = num_rows * num_cols * sizeof(T); + auto const bytes_written = num_rows * num_cols * sizeof(T); + auto const partition_bytes = num_partitions * sizeof(cudf::size_type); + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + partition_bytes)); } BENCHMARK_DEFINE_F(Hashing, hash_partition) diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp index 4ae4e139b59..6216a9ecec2 100644 --- a/cpp/benchmarks/io/csv/csv_reader_input.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp @@ -16,7 +16,6 @@ #include #include -#include #include 
#include diff --git a/cpp/benchmarks/io/csv/csv_reader_options.cpp b/cpp/benchmarks/io/csv/csv_reader_options.cpp index 2d0e0e5754e..93ef5bed774 100644 --- a/cpp/benchmarks/io/csv/csv_reader_options.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_options.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index 1ca6b5b2a9b..8ff07be1531 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -23,7 +23,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index 7fb505f1d34..c0c88517d41 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -15,8 +15,6 @@ */ #include -#include -#include #include #include //TODO find better replacement @@ -35,6 +33,8 @@ #include +#include + #include namespace { @@ -60,17 +60,16 @@ auto make_test_json_data(nvbench::state& state) auto d_input_scalar = cudf::make_string_scalar(input); auto& d_string_scalar = static_cast(*d_input_scalar); - const cudf::size_type repeat_times = string_size / input.size(); + cudf::size_type const repeat_times = string_size / input.size(); return cudf::strings::repeat_string(d_string_scalar, repeat_times); } // Type used to represent the atomic symbol type used within the finite-state machine using SymbolT = char; // Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -// Helper class to set up transition table, symbol group lookup table, and translation table -using DfaFstT = cudf::io::fst::detail::Dfa; -constexpr std::size_t single_item = 1; +using SymbolOffsetT = uint32_t; +constexpr std::size_t single_item = 1; +constexpr auto max_translation_table_size = TT_NUM_STATES * NUM_SYMBOL_GROUPS; } // namespace @@ -89,12 +88,16 @@ void BM_FST_JSON(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu(d_input.size(), stream_view); - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -124,12 +127,16 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu(d_input.size(), stream_view); - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); + 
cudf::detail::hostdevice_vector output_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -159,10 +166,14 @@ void BM_FST_JSON_no_out(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -192,11 +203,15 @@ void BM_FST_JSON_no_str(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp index 55614d040d5..31bb5dafa88 100644 --- a/cpp/benchmarks/io/json/json_reader_input.cpp +++ b/cpp/benchmarks/io/json/json_reader_input.cpp @@ -24,17 +24,13 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; -void json_read_common(cudf::io::json_writer_options const& write_opts, - cuio_source_sink_pair& source_sink, - nvbench::state& state) +void json_read_common(cuio_source_sink_pair& source_sink, nvbench::state& state) { - cudf::io::write_json(write_opts); - cudf::io::json_reader_options read_opts = cudf::io::json_reader_options::builder(source_sink.make_source_info()); @@ -69,16 +65,21 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list(data_type::STRUCT)}); auto const source_type = IO; + cuio_source_sink_pair source_sink(source_type); - auto const tbl = create_random_table( - 
cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); - auto const view = tbl->view(); + { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view).na_rep("null"); + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) + .na_rep("null") + .rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + } - json_read_common(write_opts, source_sink, state); + json_read_common(source_sink, state); } template @@ -87,16 +88,19 @@ void BM_json_read_data_type( { auto const d_type = get_type_or_group(static_cast(DataType)); auto const source_type = IO; - - auto const tbl = create_random_table( - cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); - auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view).na_rep("null"); - - json_read_common(write_opts, source_sink, state); + { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); + + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) + .na_rep("null") + .rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + } + json_read_common(source_sink, state); } using d_type_list = nvbench::enum_type_list -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp index d03f36ca81f..03ccd4e245d 100644 --- a/cpp/benchmarks/io/json/nested_json.cpp +++ b/cpp/benchmarks/io/json/nested_json.cpp @@ -16,9 +16,6 @@ #include #include -#include - -#include #include @@ -28,6 +25,8 @@ #include #include +#include + #include #include @@ -78,7 +77,7 @@ std::string generate_row( int num_columns, int max_depth, int max_list_size, int max_struct_size, size_t max_bytes) { std::string s = "{"; - const std::vector elems{ + std::vector const elems{ R"(1)", R"(-2)", R"(3.4)", R"("5")", R"("abcdefghij")", R"(true)", R"(null)"}; for (int i = 0; i < num_columns; i++) { s += R"("col)" + num_to_string(i) + R"(": )"; @@ -141,7 +140,7 @@ auto make_test_json_data(cudf::size_type string_size, rmm::cuda_stream_view stre {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a": 1, "b": 8.0, "d": { "author": "Jean-Jacques Rousseau"}},)"; - const cudf::size_type repeat_times = string_size / input.size(); + cudf::size_type const repeat_times = string_size / input.size(); auto d_input_scalar = cudf::make_string_scalar(input, stream); auto& d_string_scalar = static_cast(*d_input_scalar); @@ -192,7 +191,7 @@ void BM_NESTED_JSON_DEPTH(nvbench::state& state) auto d_scalar = cudf::string_scalar( generate_json(100'000'000, 10, depth, 10, 10, string_size), true, 
cudf::get_default_stream()); - auto input = cudf::device_span(d_scalar.data(), d_scalar.size()); + auto input = cudf::device_span(d_scalar.data(), d_scalar.size()); state.add_element_count(input.size()); auto const default_options = cudf::io::json_reader_options{}; diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 4705c083c02..b6e15fb3923 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -25,7 +24,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 0361ba7c7a6..647a411c89d 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -26,7 +25,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; // The number of separate read calls to use when reading files in multiple chunks diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 67bf4cb750b..bb373297222 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -38,7 +37,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/orc/orc_writer_chunks.cpp b/cpp/benchmarks/io/orc/orc_writer_chunks.cpp index eda70bc05e6..dff88d7ab6c 100644 --- a/cpp/benchmarks/io/orc/orc_writer_chunks.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_chunks.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -29,7 +28,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 6ad5d024312..80303ea04af 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -25,7 +24,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow 
benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -114,6 +113,38 @@ void BM_parquet_read_io_compression( parquet_read_common(write_opts, source_sink, state); } +template +void BM_parquet_read_io_small_mixed(nvbench::state& state, + nvbench::type_list>) +{ + auto const d_type = + std::pair{cudf::type_id::STRING, cudf::type_id::INT32}; + + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + cudf::size_type const num_strings = state.get_int64("num_string_cols"); + auto const source_type = IOType; + + // want 80 pages total, across 4 columns, so 20 pages per column + cudf::size_type constexpr n_col = 4; + cudf::size_type constexpr page_size_rows = 10'000; + cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col); + + auto const tbl = + create_random_table(mix_dtypes(d_type, n_col, num_strings), + row_count{num_rows}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cuio_source_sink_pair source_sink(source_type); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .max_page_size_rows(10'000) + .compression(cudf::io::compression_type::NONE); + + parquet_read_common(write_opts, source_sink, state); +} + template void BM_parquet_read_chunks( nvbench::state& state, @@ -140,7 +171,6 @@ void BM_parquet_read_chunks( cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -149,8 +179,9 @@ void BM_parquet_read_chunks( try_drop_l3_cache(); timer.start(); + auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); do { - auto chunk = reader.read_chunk(); + [[maybe_unused]] auto const chunk = reader.read_chunk(); } while (reader.has_next()); timer.stop(); }); @@ -203,3 +234,12 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("byte_limit", {0, 500'000}); + +NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed, + NVBENCH_TYPE_AXES(nvbench::enum_type_list)) + .set_name("parquet_read_io_small_mixed") + .set_type_axes_names({"io"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}) + .add_int64_axis("num_string_cols", {1, 2, 3}); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 5a6e4a8cb72..4105f2182d7 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -25,7 +24,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr std::size_t data_size = 512 << 20; constexpr std::size_t row_group_size = 128 << 20; diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp 
b/cpp/benchmarks/io/parquet/parquet_writer.cpp index d3d22e06086..13b396ea267 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -38,7 +37,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index ed70f53cad8..b85c97f65f7 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index a697c98a320..b5d855d8881 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 2ea2ec34ee8..7acf24c30a5 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -131,7 +131,7 @@ class Iterator : public cudf::benchmark {}; template void BM_iterator(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; auto num_gen = thrust::counting_iterator(0); @@ -195,7 +195,7 @@ void pair_iterator_bench_thrust(cudf::column_view& col, template void BM_pair_iterator(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; auto num_gen = thrust::counting_iterator(0); auto null_gen = diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index c606cd8b4c0..84e607a9f28 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,7 +31,7 @@ #include -__global__ static void init_curand(curandState* state, const int nstates) +__global__ static void init_curand(curandState* state, int const nstates) { int ithread = threadIdx.x + blockIdx.x * blockDim.x; @@ -40,10 +40,10 @@ __global__ static void init_curand(curandState* state, const int nstates) template __global__ static void init_build_tbl(key_type* const build_tbl, - const size_type build_tbl_size, - const int multiplicity, + size_type const build_tbl_size, + int const multiplicity, curandState* state, - const int num_states) + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; @@ -52,7 +52,7 @@ __global__ static void init_build_tbl(key_type* const build_tbl, curandState localState = state[start_idx]; for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) { - const double x = curand_uniform_double(&localState); + double const x = curand_uniform_double(&localState); build_tbl[idx] = static_cast(x * (build_tbl_size / multiplicity)); } @@ -62,13 +62,13 @@ __global__ static void init_build_tbl(key_type* const build_tbl, template __global__ void init_probe_tbl(key_type* const probe_tbl, - const size_type probe_tbl_size, - const size_type build_tbl_size, - const key_type rand_max, - const double selectivity, - const int multiplicity, + size_type const probe_tbl_size, + size_type const build_tbl_size, + key_type const rand_max, + double const selectivity, + int const multiplicity, curandState* state, - const int num_states) + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; @@ -123,11 +123,11 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, */ template void generate_input_tables(key_type* const build_tbl, - const size_type build_tbl_size, + size_type const build_tbl_size, key_type* const probe_tbl, - const size_type probe_tbl_size, - const double selectivity, - const int multiplicity) + size_type const probe_tbl_size, + double const selectivity, + int const multiplicity) { // With large values of rand_max the a lot of temporary storage is needed for the lottery. At the // expense of not being that accurate with applying the selectivity an especially more memory @@ -152,7 +152,7 @@ void generate_input_tables(key_type* const build_tbl, int num_sms{-1}; CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - const int num_states = + int const num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; rmm::device_uvector devStates(num_states, cudf::get_default_stream()); diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 1b9e8cb1cfe..1c02a4488ac 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include template diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 44b7bc0af62..7d1b1c74465 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -96,8 +96,8 @@ void BM_join(state_type& state, Join JoinFunc) } }(); - const double selectivity = 0.3; - const int multiplicity = 1; + double const selectivity = 0.3; + int const multiplicity = 1; // Generate build and probe tables auto build_random_null_mask = [](int size) { diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu index 1420625bbcd..67be4640f84 100644 --- a/cpp/benchmarks/join/mixed_join.cu +++ b/cpp/benchmarks/join/mixed_join.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include template diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index 8e2ca8e677a..dbc3234dabf 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -40,9 +40,9 @@ void BM_lists_scatter(::benchmark::State& state) auto stream = cudf::get_default_stream(); auto mr = rmm::mr::get_current_device_resource(); - const cudf::size_type base_size{(cudf::size_type)state.range(0)}; - const cudf::size_type num_elements_per_row{(cudf::size_type)state.range(1)}; - const auto num_rows = (cudf::size_type)ceil(double(base_size) / num_elements_per_row); + cudf::size_type const base_size{(cudf::size_type)state.range(0)}; + cudf::size_type const num_elements_per_row{(cudf::size_type)state.range(1)}; + auto const num_rows = (cudf::size_type)ceil(double(base_size) / num_elements_per_row); auto source_base_col = make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, base_size, @@ -62,26 +62,26 @@ void BM_lists_scatter(::benchmark::State& state) target_base_col->mutable_view().end()); auto source_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto target_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy(stream), - source_offsets->mutable_view().begin(), - source_offsets->mutable_view().end(), + source_offsets->mutable_view().begin(), + source_offsets->mutable_view().end(), 0, num_elements_per_row); thrust::sequence(rmm::exec_policy(stream), - target_offsets->mutable_view().begin(), - target_offsets->mutable_view().end(), + target_offsets->mutable_view().begin(), + target_offsets->mutable_view().end(), 0, num_elements_per_row); @@ -122,7 +122,11 @@ void BM_lists_scatter(::benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - scatter(cudf::table_view{{*source}}, *scatter_map, cudf::table_view{{*target}}, mr); + scatter(cudf::table_view{{*source}}, + *scatter_map, + cudf::table_view{{*target}}, + cudf::get_default_stream(), + mr); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * 2 * diff --git a/cpp/benchmarks/lists/set_operations.cpp b/cpp/benchmarks/lists/set_operations.cpp index 7a001b75376..5b240923358 100644 --- a/cpp/benchmarks/lists/set_operations.cpp +++ b/cpp/benchmarks/lists/set_operations.cpp @@ -15,7 +15,6 @@ */ #include -#include #include diff --git a/cpp/benchmarks/null_mask/set_null_mask.cpp 
b/cpp/benchmarks/null_mask/set_null_mask.cpp index 6d605b06c23..4ac4c9617e2 100644 --- a/cpp/benchmarks/null_mask/set_null_mask.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask.cpp @@ -23,7 +23,7 @@ class SetNullmask : public cudf::benchmark {}; void BM_setnullmask(benchmark::State& state) { - const cudf::size_type size{(cudf::size_type)state.range(0)}; + cudf::size_type const size{(cudf::size_type)state.range(0)}; rmm::device_buffer mask = cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); auto begin = 0, end = size; diff --git a/cpp/benchmarks/quantiles/quantiles.cpp b/cpp/benchmarks/quantiles/quantiles.cpp index 313a1270d91..24f9cc9c68e 100644 --- a/cpp/benchmarks/quantiles/quantiles.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -30,9 +30,9 @@ static void BM_quantiles(benchmark::State& state, bool nulls) { using Type = int; - const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; - const cudf::size_type n_quantiles{(cudf::size_type)state.range(2)}; + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + cudf::size_type const n_cols{(cudf::size_type)state.range(1)}; + cudf::size_type const n_quantiles{(cudf::size_type)state.range(2)}; // Create columns with values in the range [0,100) data_profile profile = data_profile_builder().cardinality(0).distribution( diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index ba723c16c4b..8b1e71c1585 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -30,7 +30,7 @@ template void BM_reduction_anyall(benchmark::State& state, std::unique_ptr const& agg) { - const cudf::size_type column_size{static_cast(state.range(0))}; + cudf::size_type const column_size{static_cast(state.range(0))}; auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution( dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 
0 : 100); diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 97ac5f56b2d..c1c44c919ac 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -29,7 +29,7 @@ template void BM_reduction_dictionary(benchmark::State& state, std::unique_ptr const& agg) { - const cudf::size_type column_size{static_cast(state.range(0))}; + cudf::size_type const column_size{static_cast(state.range(0))}; // int column and encoded dictionary column data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 2cabcdf680c..963c26692e7 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -27,7 +27,7 @@ class Reduction : public cudf::benchmark {}; template void BM_reduction(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); auto const input_column = create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index 41295f787fc..e55f3b9e09f 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 34e4a47c09d..5bd3e2e3bba 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -30,7 +30,7 @@ class Reduction : public cudf::benchmark {}; template void BM_reduction(benchmark::State& state, std::unique_ptr const& agg) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index d5b19faf773..ee97b54fbef 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/reduction/segmented_reduce.cpp b/cpp/benchmarks/reduction/segmented_reduce.cpp index 590a014ad76..7accb82734a 100644 --- a/cpp/benchmarks/reduction/segmented_reduce.cpp +++ b/cpp/benchmarks/reduction/segmented_reduce.cpp @@ -15,8 +15,6 @@ */ #include -#include -#include #include #include @@ -28,6 +26,8 @@ #include +#include + #include bool constexpr is_boolean_output_agg(cudf::segmented_reduce_aggregation::Kind kind) diff --git a/cpp/benchmarks/search/contains.cpp b/cpp/benchmarks/search/contains.cpp index 01a0a37b21a..8d3c3f596d5 100644 --- a/cpp/benchmarks/search/contains.cpp +++ b/cpp/benchmarks/search/contains.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/sort/nested_types_common.hpp b/cpp/benchmarks/sort/nested_types_common.hpp index e0626b1b96f..93853ba5768 100644 --- a/cpp/benchmarks/sort/nested_types_common.hpp +++ b/cpp/benchmarks/sort/nested_types_common.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include @@ -33,8 +32,8 @@ inline std::unique_ptr create_lists_data(nvbench::state& state, cudf::size_type 
const min_val = 0, cudf::size_type const max_val = 5) { - const size_t size_bytes(state.get_int64("size_bytes")); - const cudf::size_type depth{static_cast(state.get_int64("depth"))}; + size_t const size_bytes(state.get_int64("size_bytes")); + cudf::size_type const depth{static_cast(state.get_int64("depth"))}; auto const null_frequency{state.get_float64("null_frequency")}; data_profile table_profile; @@ -55,9 +54,9 @@ inline std::unique_ptr create_structs_data(nvbench::state& state, std::default_random_engine generator; std::uniform_int_distribution distribution(0, 100); - const cudf::size_type n_rows{static_cast(state.get_int64("NumRows"))}; - const cudf::size_type depth{static_cast(state.get_int64("Depth"))}; - const bool nulls{static_cast(state.get_int64("Nulls"))}; + cudf::size_type const n_rows{static_cast(state.get_int64("NumRows"))}; + cudf::size_type const depth{static_cast(state.get_int64("Depth"))}; + bool const nulls{static_cast(state.get_int64("Nulls"))}; // Create columns with values in the range [0,100) std::vector columns; diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index b2178f8f187..6231c7016aa 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -27,7 +27,7 @@ class Rank : public cudf::benchmark {}; static void BM_rank(benchmark::State& state, bool nulls) { using Type = int; - const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; // Create columns with values in the range [0,100) data_profile profile = data_profile_builder().cardinality(0).distribution( diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp index c0227e85191..85427e2128f 100644 --- a/cpp/benchmarks/sort/rank_structs.cpp +++ b/cpp/benchmarks/sort/rank_structs.cpp @@ -26,7 +26,7 @@ void nvbench_rank_structs(nvbench::state& state, nvbench::type_list(state.get_int64("Nulls"))}; + bool const nulls{static_cast(state.get_int64("Nulls"))}; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::rank(table->view().column(0), diff --git a/cpp/benchmarks/sort/segmented_sort.cpp b/cpp/benchmarks/sort/segmented_sort.cpp index 22d2b1c4029..2e835259cbc 100644 --- a/cpp/benchmarks/sort/segmented_sort.cpp +++ b/cpp/benchmarks/sort/segmented_sort.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/sort/sort.cpp b/cpp/benchmarks/sort/sort.cpp index cab25f442bb..267a740aee9 100644 --- a/cpp/benchmarks/sort/sort.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -29,8 +29,8 @@ static void BM_sort(benchmark::State& state, bool nulls) { using Type = int; auto const dtype = cudf::type_to_id(); - const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + cudf::size_type const n_cols{(cudf::size_type)state.range(1)}; // Create table with values in the range [0,100) data_profile const profile = data_profile_builder() diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp index 3cab60a29ec..4b04323a99f 100644 --- a/cpp/benchmarks/sort/sort_lists.cpp +++ b/cpp/benchmarks/sort/sort_lists.cpp @@ -84,7 +84,7 @@ void sort_lists_of_structs(nvbench::state& state) void nvbench_sort_lists(nvbench::state& state) { - const auto has_lists_of_structs = state.get_int64("lists_of_structs") > 0; + auto const has_lists_of_structs = state.get_int64("lists_of_structs") > 0; if 
(has_lists_of_structs) { sort_lists_of_structs(state); } else { diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index 216ebc6bfd7..a6feaf04842 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -65,7 +65,7 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) (column_bytes_out + validity_bytes_out) * num_columns; // writing columns state.SetItemsProcessed(state.iterations() * column_size * num_columns); - state.SetBytesProcessed(static_cast(state.iterations()) * bytes_read + bytes_written); + state.SetBytesProcessed(static_cast(state.iterations()) * (bytes_read + bytes_written)); } } // namespace @@ -73,8 +73,8 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) template void BM_apply_boolean_mask(benchmark::State& state, cudf::size_type num_columns) { - const cudf::size_type column_size{static_cast(state.range(0))}; - const cudf::size_type percent_true{static_cast(state.range(1))}; + cudf::size_type const column_size{static_cast(state.range(0))}; + cudf::size_type const percent_true{static_cast(state.range(1))}; data_profile profile = data_profile_builder().cardinality(0).null_probability(0.0).distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 81eafa3044f..c04b6516903 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp new file mode 100644 index 00000000000..bcee3048013 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) +{ + cudf::size_type const num_rows = state.get_int64("NumRows"); + + data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + + auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto input_column = source_column->view(); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(input_table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) + .set_name("stable_distinct") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +template +void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) +{ + auto const size = state.get_int64("ColumnSize"); + auto const dtype = cudf::type_to_id(); + double const null_probability = state.get_float64("null_probability"); + + auto builder = data_profile_builder().null_probability(null_probability); + if (dtype == cudf::type_id::LIST) { + builder.distribution(dtype, distribution_id::UNIFORM, 0, 4) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4) + .list_depth(1); + } else { + // We're comparing stable_distinct() on a non-nested column to that on a list column with the + // same number of stable_distinct rows. The max list size is 4 and the number of distinct values + // in the list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + + // 5^4 = 781. We want this column to also have 781 distinct values. 
+ builder.distribution(dtype, distribution_id::UNIFORM, 0, 781); + } + + auto const table = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("stable_distinct_list") + .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.0, 0.1}) + .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/unique.cpp b/cpp/benchmarks/stream_compaction/unique.cpp index dafb9d506c7..854bc17e9c1 100644 --- a/cpp/benchmarks/stream_compaction/unique.cpp +++ b/cpp/benchmarks/stream_compaction/unique.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/stream_compaction/unique_count.cpp b/cpp/benchmarks/stream_compaction/unique_count.cpp index f8319e0385c..e003c476685 100644 --- a/cpp/benchmarks/stream_compaction/unique_count.cpp +++ b/cpp/benchmarks/stream_compaction/unique_count.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 0cdd5fbac32..385bb7630f8 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -26,7 +25,7 @@ void bench_case(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("width")); + auto const max_width = static_cast(state.get_int64("row_width")); auto const encoding = state.get_string("encoding"); if (static_cast(n_rows) * static_cast(max_width) >= @@ -72,7 +71,7 @@ void bench_case(nvbench::state& state) } NVBENCH_BENCH(bench_case) - .set_name("strings_case") - .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048}) + .set_name("case") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp new file mode 100644 index 00000000000..8e9e595fcef --- /dev/null +++ b/cpp/benchmarks/string/char_types.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_char_types(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const api_type = state.get_string("api"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto input_types = cudf::strings::string_character_types::SPACE; + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + if (api_type == "all") { + state.add_global_memory_writes(num_rows); // output is a bool8 per row + } else { + state.add_global_memory_writes(chars_size); + } + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (api_type == "all") { + auto result = cudf::strings::all_characters_of_type(input, input_types); + } else { + auto result = cudf::strings::filter_characters_of_type(input, input_types); + } + }); +} + +NVBENCH_BENCH(bench_char_types) + .set_name("char_types") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 714d50ffce3..af45d5d8fee 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -16,35 +16,46 @@ #include #include -#include #include #include +#include #include -#include #include #include #include -class StringContains : public cudf::benchmark {}; +#include -std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t hit_rate) +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate) { // build input table using the following data - auto data = cudf::test::strings_column_wrapper({ - "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }); - auto data_view = cudf::column_view(data); + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = 
raw_data->view(); // compute number of rows in n_rows that should match auto matches = static_cast(n_rows * hit_rate) / 100; @@ -68,51 +79,39 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t return std::move(table->release().front()); } -enum contains_type { contains, count, findall }; - // longer pattern lengths demand more working memory per string std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"}; -static void BM_contains(benchmark::State& state, contains_type ct) +static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const pattern_index = static_cast(state.range(1)); - auto const hit_rate = static_cast(state.range(2)); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const pattern_index = static_cast(state.get_int64("pattern")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } - auto col = build_input_column(n_rows, hit_rate); + auto col = build_input_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; auto program = cudf::strings::regex_program::create(pattern); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (ct) { - case contains_type::contains: // contains_re and matches_re use the same main logic - cudf::strings::contains_re(input, *program); - break; - case contains_type::count: // counts occurrences of matches - cudf::strings::count_re(input, *program); - break; - case contains_type::findall: // returns occurrences of all matches - cudf::strings::findall(input, *program); - break; - } - } + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(input.size()); - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains_re(input, *program); }); } -#define STRINGS_BENCHMARK_DEFINE(name, b) \ - BENCHMARK_DEFINE_F(StringContains, name) \ - (::benchmark::State & st) { BM_contains(st, contains_type::b); } \ - BENCHMARK_REGISTER_F(StringContains, name) \ - ->ArgsProduct({{4096, 32768, 262144, 2097152, 16777216}, /* row count */ \ - {0, 1}, /* patterns index */ \ - {1, 5, 10, 25, 70, 100}}) /* hit rate */ \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(contains_re, contains) -STRINGS_BENCHMARK_DEFINE(count_re, count) -STRINGS_BENCHMARK_DEFINE(findall_re, findall) +NVBENCH_BENCH(bench_contains) + .set_name("contains") + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("hit_rate", {50, 100}) // percentage + .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index 0db38bb5add..f12d292c2e7 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -31,7 +31,7 @@ class DurationsToString : public cudf::benchmark {}; template void BM_convert_from_durations(benchmark::State& state) { - const cudf::size_type source_size = 
state.range(0); + cudf::size_type const source_size = state.range(0); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator( @@ -51,7 +51,7 @@ class StringToDurations : public cudf::benchmark {}; template void BM_convert_to_durations(benchmark::State& state) { - const cudf::size_type source_size = state.range(0); + cudf::size_type const source_size = state.range(0); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator( diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 69fc65333b8..0cc98ee146c 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -38,14 +38,14 @@ class StringsToFixedPoint : public cudf::benchmark {}; template void convert_to_fixed_point(benchmark::State& state) { - const auto rows = static_cast(state.range(0)); - const auto strings_col = get_strings_column(rows); - const auto strings_view = cudf::strings_column_view(strings_col->view()); - const auto dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; + auto const rows = static_cast(state.range(0)); + auto const strings_col = get_strings_column(rows); + auto const strings_view = cudf::strings_column_view(strings_col->view()); + auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; for (auto _ : state) { cuda_event_timer raii(state, true); - volatile auto results = cudf::strings::to_fixed_point(strings_view, dtype); + auto volatile results = cudf::strings::to_fixed_point(strings_view, dtype); } // bytes_processed = bytes_input + bytes_output @@ -58,10 +58,10 @@ class StringsFromFixedPoint : public cudf::benchmark {}; template void convert_from_fixed_point(benchmark::State& state) { - const auto rows = static_cast(state.range(0)); - const auto strings_col = get_strings_column(rows); - const auto dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; - const auto fp_col = + auto const rows = static_cast(state.range(0)); + auto const strings_col = get_strings_column(rows); + auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; + auto const fp_col = cudf::strings::to_fixed_point(cudf::strings_column_view(strings_col->view()), dtype); std::unique_ptr results = nullptr; diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp new file mode 100644 index 00000000000..08406462632 --- /dev/null +++ b/cpp/benchmarks/string/count.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_count(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + std::string pattern = "\\d+"; + + auto prog = cudf::strings::regex_program::create(pattern); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(input.size()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::strings::count_re(input, *prog); }); +} + +NVBENCH_BENCH(bench_count) + .set_name("count") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 021062ee479..9e67c5a5b52 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -14,34 +14,37 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include -#include -#include #include #include #include #include +#include -#include +#include -class StringExtract : public cudf::benchmark {}; +#include -static void BM_extract(benchmark::State& state, int groups) +static void bench_extract(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const n_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; std::uniform_int_distribution words_dist(0, 999); - std::vector samples(100); // 100 unique rows of data to reuse std::generate(samples.begin(), samples.end(), [&]() { std::string row; // build a row of random tokens - while (static_cast(row.size()) < n_length) { + while (static_cast(row.size()) < row_width) { row += std::to_string(words_dist(generator)) + " "; } return row; @@ -55,41 +58,27 @@ static void BM_extract(benchmark::State& state, int groups) cudf::test::strings_column_wrapper samples_column(samples.begin(), samples.end()); data_profile const profile = data_profile_builder().no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0ul, samples.size() - 1); - auto map = create_random_column(cudf::type_to_id(), row_count{n_rows}, profile); + auto map = + create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); auto input = cudf::gather( cudf::table_view{{samples_column}}, map->view(), cudf::out_of_bounds_policy::DONT_CHECK); 
cudf::strings_column_view strings_view(input->get_column(0).view()); auto prog = cudf::strings::regex_program::create(pattern); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto results = cudf::strings::extract(strings_view, *prog); - } - - state.SetBytesProcessed(state.iterations() * strings_view.chars_size()); -} + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = strings_view.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_multiplier = 8; - int const min_row_length = 1 << 5; - int const max_row_length = 1 << 13; - int const length_multiplier = 4; - generate_string_bench_args( - b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::extract(strings_view, *prog); + }); } -#define STRINGS_BENCHMARK_DEFINE(name, instructions) \ - BENCHMARK_DEFINE_F(StringExtract, name) \ - (::benchmark::State & st) { BM_extract(st, instructions); } \ - BENCHMARK_REGISTER_F(StringExtract, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(one, 1) -STRINGS_BENCHMARK_DEFINE(two, 2) -STRINGS_BENCHMARK_DEFINE(four, 4) -STRINGS_BENCHMARK_DEFINE(eight, 8) +NVBENCH_BENCH(bench_extract) + .set_name("extract") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp new file mode 100644 index 00000000000..530b09b7d6a --- /dev/null +++ b/cpp/benchmarks/string/gather.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_gather(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const input_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + data_profile const map_profile = data_profile_builder().no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows); + auto const map_table = + create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = cudf::strings_column_view(input_table->view().column(0)).chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::gather( + input_table->view(), map_table->view().column(0), cudf::out_of_bounds_policy::NULLIFY); + }); +} + +NVBENCH_BENCH(bench_gather) + .set_name("gather") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp new file mode 100644 index 00000000000..a122c0022a9 --- /dev/null +++ b/cpp/benchmarks/string/join_strings.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
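bench_gather above times the table-level gather path on strings data; a hedged sketch of the call it wraps, with illustrative names:

#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>

// Rows of 'input' are selected by the integer indices in 'map'; NULLIFY makes
// any out-of-range index produce a null row instead of undefined behavior.
std::unique_ptr<cudf::table> gather_rows(cudf::table_view const& input,
                                         cudf::column_view const& map)
{
  return cudf::gather(input, map, cudf::out_of_bounds_policy::NULLIFY);
}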
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_join(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto const chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written + + std::string separator(":"); + std::string narep("null"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::join_strings(input, separator, narep); + }); +} + +NVBENCH_BENCH(bench_join) + .set_name("strings_join") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu index 1d19e63102d..7e89edf3e17 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/string/json.cu @@ -32,7 +32,7 @@ class JsonPath : public cudf::benchmark {}; -const std::vector Books{ +std::vector const Books{ R"json({ "category": "reference", "author": "Nigel Rees", @@ -60,7 +60,7 @@ const std::vector Books{ "price": 22.99 })json"}; constexpr int Approx_book_size = 110; -const std::vector Bicycles{ +std::vector const Bicycles{ R"json({"color": "red", "price": 9.95})json", R"json({"color": "green", "price": 29.95})json", R"json({"color": "blue", "price": 399.95})json", diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index 4540e4a8f42..36c4bf64a00 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -51,6 +50,6 @@ static void bench_lengths(nvbench::state& state) } NVBENCH_BENCH(bench_lengths) - .set_name("strings_lengths") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}); + .set_name("lengths") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index d86c31480dd..6ac832471a5 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -15,12 +15,12 @@ */ #include -#include #include #include #include +#include #include #include #include @@ -28,22 +28,33 @@ #include namespace { -std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t hit_rate) +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate) { // build input table using the following data - auto data = cudf::test::strings_column_wrapper({ - "123 abc 4567890 DEFGHI 0987 5W43", 
// matches always; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }); - auto data_view = cudf::column_view(data); + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches always; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); // compute number of rows in n_rows that should match auto matches = static_cast(n_rows * hit_rate) / 100; @@ -71,14 +82,20 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t static void bench_like(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); - auto col = build_input_column(n_rows, hit_rate); + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto col = build_input_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); // This pattern forces reading the entire target string (when matched expected) - auto pattern = std::string("% 5W4_"); // regex equivalent: ".* 5W4." + auto pattern = std::string("% 5W4_"); // regex equivalent: ".* 5W4.$" state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well @@ -93,5 +110,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("hit_rate", {1, 5, 10, 25, 70, 100}); + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index f719fe31bd8..b8efd76ab41 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -14,72 +14,54 @@ * limitations under the License. 
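In the LIKE pattern used by bench_like, '%' matches any sequence of characters and '_' matches exactly one, so "% 5W4_" cannot match without scanning each string. A hedged sketch of the call under test (the contains.hpp header path and the defaulted escape argument are assumptions):

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Rows containing " 5W4" followed by exactly one more character yield true.
std::unique_ptr<cudf::column> run_like(cudf::strings_column_view const& input)
{
  return cudf::strings::like(input, cudf::string_scalar("% 5W4_"));
}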
*/ -#include "string_bench_args.hpp" - #include #include -#include - -#include #include #include #include #include -class StringReplace : public cudf::benchmark {}; - -enum replace_type { replace_re, replace_re_multi, replace_backref }; +#include -static void BM_replace(benchmark::State& state, replace_type rt) +static void bench_replace(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const rtype = state.get_string("type"); + + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); - cudf::test::strings_column_wrapper repls({"#", ""}); - auto prog = cudf::strings::regex_program::create("\\d+"); - auto prog_backref = cudf::strings::regex_program::create("(\\d+)"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case replace_type::replace_re: // contains_re and matches_re use the same main logic - cudf::strings::replace_re(input, *prog); - break; - case replace_type::replace_re_multi: // counts occurrences of pattern - cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls)); - break; - case replace_type::replace_backref: // returns occurrences of matches - cudf::strings::replace_with_backrefs(input, *prog_backref, "#\\1X"); - break; - } - } + auto program = cudf::strings::regex_program::create("(\\d+)"); - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (rtype == "backref") { + auto replacement = std::string("#\\1X"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::replace_with_backrefs(input, *program, replacement); + }); + } else { + auto replacement = std::string("77"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::replace_re(input, *program, replacement); + }); + } } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringReplace, name) \ - (::benchmark::State & st) { BM_replace(st, name); } \ - BENCHMARK_REGISTER_F(StringReplace, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(replace_re) -STRINGS_BENCHMARK_DEFINE(replace_re_multi) -STRINGS_BENCHMARK_DEFINE(replace_backref) +NVBENCH_BENCH(bench_replace) + .set_name("replace_re") + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + 
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index 4c3846c79bb..31cd4639115 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -51,6 +50,6 @@ static void bench_reverse(nvbench::state& state) } NVBENCH_BENCH(bench_reverse) - .set_name("strings_reverse") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128}); + .set_name("reverse") + .add_int64_axis("row_width", {8, 16, 32, 64, 128}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index e0b801ea0a7..6c1d7d98d3a 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -33,7 +33,7 @@ class StringSlice : public cudf::benchmark {}; -enum slice_type { position, multi_position, delimiter, multi_delimiter }; +enum slice_type { position, multi_position }; static void BM_slice(benchmark::State& state, slice_type rt) { @@ -47,8 +47,6 @@ static void BM_slice(benchmark::State& state, slice_type rt) auto stops_itr = thrust::constant_iterator(max_str_length / 2); cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); - auto delim_itr = thrust::constant_iterator(" "); - cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); for (auto _ : state) { cuda_event_timer raii(state, true, cudf::get_default_stream()); @@ -57,10 +55,6 @@ static void BM_slice(benchmark::State& state, slice_type rt) cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2); break; case multi_position: cudf::strings::slice_strings(input, starts, stops); break; - case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break; - case multi_delimiter: - cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1); - break; } } @@ -88,5 +82,3 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) STRINGS_BENCHMARK_DEFINE(position) STRINGS_BENCHMARK_DEFINE(multi_position) -STRINGS_BENCHMARK_DEFINE(delimiter) -STRINGS_BENCHMARK_DEFINE(multi_delimiter) diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 021a7341ddd..eb724fabfd1 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -15,8 +15,6 @@ */ #include -#include -#include #include @@ -25,63 +23,49 @@ #include #include -#include +#include -class StringSplit : public cudf::benchmark {}; +static void bench_split(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const stype = state.get_string("type"); -enum split_type { split, split_ws, record, record_ws }; + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } -static void BM_split(benchmark::State& state, split_type rt) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, 
max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case split: cudf::strings::split(input, target); break; - case split_ws: cudf::strings::split(input); break; - case record: cudf::strings::split_record(input, target); break; - case record_ws: cudf::strings::split_record(input); break; - } - } - - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int constexpr min_rows = 1 << 12; - int constexpr max_rows = 1 << 24; - int constexpr row_mult = 8; - int constexpr min_rowlen = 1 << 5; - int constexpr max_rowlen = 1 << 13; - int constexpr len_mult = 2; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } + if (stype == "split") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split(input, target); }); + } else if (stype == "split_ws") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split(input); }); + } else if (stype == "record") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split_record(input, target); }); + } else { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split_record(input); }); } } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringSplit, name) \ - (::benchmark::State & st) { BM_split(st, split_type::name); } \ - BENCHMARK_REGISTER_F(StringSplit, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(split) -STRINGS_BENCHMARK_DEFINE(split_ws) -STRINGS_BENCHMARK_DEFINE(record) -STRINGS_BENCHMARK_DEFINE(record_ws) +NVBENCH_BENCH(bench_split) + .set_name("split") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp new file mode 100644 index 00000000000..67aa6f0e008 --- /dev/null +++ b/cpp/benchmarks/string/split_re.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
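The "type" axis above selects between the column-wise and record-wise split APIs, whose outputs have different shapes; the new split_re.cpp benchmark that follows applies the same pattern to the regex variant, split_record_re. A hedged sketch of the two shapes, with assumed header paths:

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>

#include <memory>

// split() pads each row out to the maximum token count and returns one strings
// column per token position.
std::unique_ptr<cudf::table> split_columns(cudf::strings_column_view const& input)
{
  return cudf::strings::split(input, cudf::string_scalar("+"));
}

// split_record() instead returns a single LIST column holding a variable-length
// list of tokens per row.
std::unique_ptr<cudf::column> split_lists(cudf::strings_column_view const& input)
{
  return cudf::strings::split_record(input, cudf::string_scalar("+"));
}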
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_split(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto prog = cudf::strings::regex_program::create("\\d+"); + + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + cudf::strings_column_view input(column->view()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::split_record_re(input, *prog); + }); +} + +NVBENCH_BENCH(bench_split) + .set_name("split_re") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp index bbf90e6f68a..5993bb23542 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state, CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); if (l2_cache_bytes > 0) { - const int memset_value = 0; + int const memset_value = 0; rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); CUDF_CUDA_TRY( cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp new file mode 100644 index 00000000000..8a8bd9ae586 --- /dev/null +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +#include + +#include + +static void bench_edit_distance(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const strings_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input1(strings_table->view().column(0)); + cudf::strings_column_view input2(strings_table->view().column(1)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input1.chars_size() + input2.chars_size(); + state.add_global_memory_reads(chars_size); + // output are integers (one per row) + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); }); +} + +NVBENCH_BENCH(bench_edit_distance) + .set_name("edit_distance") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp new file mode 100644 index 00000000000..5bbd2fc6819 --- /dev/null +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
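bench_edit_distance above reports one integer per row: the pairwise Levenshtein distance between the two input columns. A small hedged example with illustrative values:

#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <nvtext/edit_distance.hpp>

void edit_distance_example()
{
  cudf::test::strings_column_wrapper col1({"kitten", "hello"});
  cudf::test::strings_column_wrapper col2({"sitting", "hello"});
  // One distance per row: {3, 0} -- "kitten" -> "sitting" takes three edits.
  auto distances = nvtext::edit_distance(cudf::strings_column_view(col1),
                                         cudf::strings_column_view(col2));
}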
+ */ + +#include + +#include + +#include + +#include + +#include + +static void bench_hash_ngrams(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const ngrams = static_cast(state.get_int64("ngrams")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input(strings_table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + // output are hashes: approximate total number of hashes + state.add_global_memory_writes(num_rows * ngrams); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::hash_character_ngrams(input, ngrams); + }); +} + +NVBENCH_BENCH(bench_hash_ngrams) + .set_name("hash_ngrams") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp new file mode 100644 index 00000000000..70470b829bd --- /dev/null +++ b/cpp/benchmarks/text/jaccard.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
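bench_hash_ngrams sizes its write counter as num_rows * ngrams, which is only an approximation: hash_character_ngrams produces a list of one hash per character n-gram in each row. A hedged sketch (the generate_ngrams.hpp header path is an assumption):

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/generate_ngrams.hpp>

#include <memory>

// For a row of length L and n-gram size n the list holds (L - n + 1) hashes,
// so the benchmark's num_rows * ngrams write estimate is only approximate.
std::unique_ptr<cudf::column> hash_5grams(cudf::strings_column_view const& input)
{
  return nvtext::hash_character_ngrams(input, 5);
}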
+ */ + +#include + +#include + +#include + +#include + +#include + +static void bench_jaccard(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const substring_width = static_cast(state.get_int64("substring_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .no_validity(); + auto const input_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input1(input_table->view().column(0)); + cudf::strings_column_view input2(input_table->view().column(1)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input1.chars_size() + input2.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::jaccard_index(input1, input2, substring_width); + }); +} + +NVBENCH_BENCH(bench_jaccard) + .set_name("jaccard") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 15c39015d74..1b60caa24de 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -31,6 +30,7 @@ static void bench_minhash(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -45,9 +45,9 @@ static void bench_minhash(nvbench::state& state) data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seeds_table = create_random_table( - {cudf::type_to_id()}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); seeds.set_null_mask(rmm::device_buffer{}, 0); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,13 +57,15 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 ? 
nvtext::minhash64(input, seeds.view(), hash_width) + : nvtext::minhash(input, seeds.view(), hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10, 25}) - .add_int64_axis("seed_count", {2, 26}); + .add_int64_axis("hash_width", {5, 10}) + .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 733f2da8b2a..6878fa4f8b6 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -24,51 +23,43 @@ #include -class TextNormalize : public cudf::benchmark {}; +#include -static void BM_normalize(benchmark::State& state, bool to_lower) +static void bench_normalize(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const normalize_type = state.get_string("type"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - nvtext::normalize_characters(input, to_lower); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen * 4; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } + if (normalize_type == "spaces") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); + } else { + bool const to_lower = (normalize_type == "to_lower"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::normalize_characters(input, to_lower); + }); } } -#define NVTEXT_BENCHMARK_DEFINE(name, lower) \ - BENCHMARK_DEFINE_F(TextNormalize, name) \ - (::benchmark::State & st) { BM_normalize(st, lower); } \ - 
BENCHMARK_REGISTER_F(TextNormalize, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(characters, false) -NVTEXT_BENCHMARK_DEFINE(to_lower, true) +NVBENCH_BENCH(bench_normalize) + .set_name("normalize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp deleted file mode 100644 index 82d9316e25b..00000000000 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -class TextNormalize : public cudf::benchmark {}; - -static void BM_normalize(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - nvtext::normalize_spaces(input); - } - - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); -} - -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNormalize, name) \ - (::benchmark::State & st) { BM_normalize(st); } \ - BENCHMARK_REGISTER_F(TextNormalize, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(spaces) diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 21d69c4d40e..257f62aa728 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -15,8 +15,6 @@ */ #include -#include -#include #include @@ -24,14 +22,19 @@ #include -#include +#include -class TextReplace : public cudf::benchmark {}; +#include -static void BM_replace(benchmark::State& state) +static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const n_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + 
static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", @@ -41,46 +44,32 @@ static void BM_replace(benchmark::State& state) std::default_random_engine generator; std::uniform_int_distribution tokens_dist(0, words.size() - 1); std::string row; // build a row of random tokens - while (static_cast(row.size()) < n_length) + while (static_cast(row.size()) < row_width) row += words[tokens_dist(generator)]; std::uniform_int_distribution position_dist(0, 16); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&](auto idx) { return row.c_str() + position_dist(generator); }); - cudf::test::strings_column_wrapper input(elements, elements + n_rows); + cudf::test::strings_column_wrapper input(elements, elements + num_rows); cudf::strings_column_view view(input); cudf::test::strings_column_wrapper targets({"one", "two", "sevén", "zero"}); cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"}); - for (auto _ : state) { - cuda_event_timer raii(state, true); - nvtext::replace_tokens( - view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * view.chars_size()); -} + auto chars_size = view.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_multiplier = 8; - int const min_row_length = 1 << 5; - int const max_row_length = 1 << 13; - int const length_multiplier = 4; - generate_string_bench_args( - b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::replace_tokens( + view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); + }); } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextReplace, name) \ - (::benchmark::State & st) { BM_replace(st); } \ - BENCHMARK_REGISTER_F(TextReplace, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(replace) +NVBENCH_BENCH(bench_replace) + .set_name("replace") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index a683214448f..1dd7322a5c8 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -27,12 +27,10 @@ #include #include -#define MAX_ROWS_TENSOR 300 - static std::string create_hash_vocab_file() { std::string dir_template{std::filesystem::temp_directory_path().string()}; - if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p; + if (char const* env_p = std::getenv("WORKSPACE")) dir_template = env_p; std::string hash_file = dir_template + "/hash_vocab.txt"; // create a fake hashed vocab text file for this test // this only works with words in the strings in the benchmark code below @@ -57,7 +55,7 @@ static std::string create_hash_vocab_file() static void BM_subword_tokenizer(benchmark::State& state) { auto const nrows = static_cast(state.range(0)); - std::vector 
h_strings(nrows, "This is a test "); + std::vector h_strings(nrows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; @@ -74,8 +72,7 @@ static void BM_subword_tokenizer(benchmark::State& state) max_sequence_length, stride, do_lower, - do_truncate, - MAX_ROWS_TENSOR); + do_truncate); } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index bd80af08a74..423fe667b05 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -16,8 +16,6 @@ #include #include -#include -#include #include @@ -28,73 +26,57 @@ #include #include -class TextTokenize : public cudf::benchmark {}; +#include -enum class tokenize_type { single, multi, count, count_multi, ngrams, characters }; - -static void BM_tokenize(benchmark::State& state, tokenize_type tt) +static void bench_tokenize(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const tokenize_type = state.get_string("type"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); - cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (tt) { - case tokenize_type::single: - // single whitespace delimiter - nvtext::tokenize(input); - break; - case tokenize_type::multi: - nvtext::tokenize(input, cudf::strings_column_view(delimiters)); - break; - case tokenize_type::count: - // single whitespace delimiter - nvtext::count_tokens(input); - break; - case tokenize_type::count_multi: - nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); - break; - case tokenize_type::ngrams: - // default is bigrams - nvtext::ngrams_tokenize(input); - break; - case tokenize_type::characters: - // every character becomes a string - nvtext::character_tokenize(input); - break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (tokenize_type == "whitespace") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::tokenize(input); }); + } else if (tokenize_type == "multi") { + 
cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::tokenize(input, cudf::strings_column_view(delimiters)); + }); + } else if (tokenize_type == "count") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::count_tokens(input); }); + } else if (tokenize_type == "count_multi") { + cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); + }); + } else if (tokenize_type == "ngrams") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + } else if (tokenize_type == "characters") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextTokenize, name) \ - (::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \ - BENCHMARK_REGISTER_F(TextTokenize, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(single) -NVTEXT_BENCHMARK_DEFINE(multi) -NVTEXT_BENCHMARK_DEFINE(count) -NVTEXT_BENCHMARK_DEFINE(count_multi) -NVTEXT_BENCHMARK_DEFINE(ngrams) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_tokenize) + .set_name("tokenize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 362d3825f81..3f985cffb1f 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -126,7 +126,7 @@ struct RowHandle { template __global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) { - const cudf::size_type n_rows = source.num_rows(); + cudf::size_type const n_rows = source.num_rows(); cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; while (index < n_rows) { @@ -141,8 +141,8 @@ __global__ void device_dispatching_kernel(cudf::mutable_table_device_view source template void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_thread) { - const cudf::size_type n_rows = input.num_rows(); - const cudf::size_type n_cols = input.num_columns(); + cudf::size_type const n_rows = input.num_rows(); + cudf::size_type const n_cols = input.num_columns(); cudf::detail::grid_1d grid_config{n_rows, block_size}; int grid_size = grid_config.num_blocks; @@ -169,9 +169,9 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const auto n_cols = static_cast(state.range(0)); - const auto source_size = static_cast(state.range(1)); - const auto work_per_thread = static_cast(state.range(2)); + auto const n_cols = static_cast(state.range(0)); + auto const source_size = static_cast(state.range(1)); + auto const work_per_thread = static_cast(state.range(2)); auto init = cudf::make_fixed_width_scalar(static_cast(0)); diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index c877c9c6466..894dc9649e2 
100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -162,13 +162,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static - arrow_dataset_static + GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static + parquet_static arrow_acero_static arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" + "ARROW_ACERO ON" "ARROW_IPC ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" @@ -221,7 +222,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # Set this to enable `find_package(Parquet)` set(Parquet_DIR "${Arrow_DIR}") endif() - # Set this to enable `find_package(ArrowDataset)` + # Set this to enable `find_package(ArrowDataset)`. This will call find_package(ArrowAcero) for + # us set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) endif() @@ -295,9 +297,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB APPEND arrow_code_string " - if(NOT TARGET xsimd) - add_library(xsimd INTERFACE IMPORTED) - target_include_directories(xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") + if(NOT TARGET arrow::xsimd) + add_library(arrow::xsimd INTERFACE IMPORTED) + target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") endif() " ) @@ -314,6 +316,26 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(ENABLE_PARQUET) + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) + set(arrow_dataset_code_string [=[ if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) @@ -381,7 +403,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 11.0.0 + 12.0.1 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/cpp/cmake/thirdparty/get_cufile.cmake index 21088f4ec0f..c0235eba508 100644 --- a/cpp/cmake/thirdparty/get_cufile.cmake +++ b/cpp/cmake/thirdparty/get_cufile.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,7 +16,7 @@ function(find_and_configure_cufile) list(APPEND CMAKE_MODULE_PATH ${CUDF_SOURCE_DIR}/cmake/Modules) - rapids_find_package(cuFile QUIET) + rapids_find_package(cuFile) if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake new file mode 100644 index 00000000000..0e03352c335 --- /dev/null +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -0,0 +1,37 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds thrust and sets any additional necessary environment variables. +function(find_and_configure_libcudacxx) + # Make sure we install libcudacxx beside our patched version of thrust + include(GNUInstallDirs) + set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") + set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + + if(libcudacxx_SOURCE_DIR) + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL + libcudacxx + [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] + cudf-exports + ) + endif() +endfunction() + +find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 25a4c9dd3ba..39a9de15fa6 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -37,8 +37,8 @@ function(find_and_configure_thrust) # Store where CMake can find our custom Thrust install include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/cmake/thrust]=] - cudf-exports + INSTALL Thrust + [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] cudf-exports ) endif() endfunction() diff --git a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff b/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff index 3e7a0f8ed77..04f96f49b48 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff +++ b/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff @@ -1,15 +1,17 @@ diff --git a/nvbench/main.cuh b/nvbench/main.cuh -index 0ba82d7..7ab02c1 100644 +index 0ba82d7..cca5273 100644 --- a/nvbench/main.cuh +++ b/nvbench/main.cuh -@@ -54,6 +54,14 @@ +@@ -54,6 +54,16 @@ // clang-format on #endif +#ifndef NVBENCH_ENVIRONMENT +namespace nvbench { +struct no_environment -+{}; ++{ ++ no_environment(int, char const *const *) {} ++}; +} +#define NVBENCH_ENVIRONMENT nvbench::no_environment +#endif @@ -17,11 +19,11 @@ index 0ba82d7..7ab02c1 100644 #define NVBENCH_MAIN_PARSE(argc, argv) \ nvbench::option_parser parser; \ parser.parse(argc, argv) -@@ -77,6 +85,7 @@ +@@ -77,6 +87,7 @@ printer.set_total_state_count(total_states); \ \ printer.set_completed_state_count(0); \ -+ [[maybe_unused]] auto env_state = NVBENCH_ENVIRONMENT(); \ ++ [[maybe_unused]] auto env_state = NVBENCH_ENVIRONMENT(argc, argv); \ for (auto &bench_ptr : benchmarks) \ { \ bench_ptr->set_printer(printer); \ diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index d5df222ae37..7be868081b6 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -12,11 +12,6 @@ "file" : "nvbench/use_existing_fmt.diff", "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", "fixed_in" : "" - }, - { - "file" : "nvbench/public_fmt_dep_in_conda.diff", - "issue" : "Propagate fmt requirement in conda envs [https://github.com/NVIDIA/nvbench/pull/127]", - "fixed_in" : "" } ] } diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index e1e8a0fa31b..b072d252881 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.20 +# Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -32,13 +32,13 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "libcudf" +PROJECT_NAME = libcudf # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.06.00 +PROJECT_NUMBER = 23.10.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. 
+# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -305,7 +313,10 @@ OPTIMIZE_OUTPUT_SLICE = NO # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = cu=C++ \ cuh=C++ @@ -516,6 +527,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -553,11 +571,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -796,7 +821,10 @@ WARN_IF_DOC_ERROR = YES WARN_NO_PARAMDOC = YES # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = NO @@ -846,8 +874,8 @@ INPUT = main_page.md \ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -860,13 +888,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.cpp \ *.hpp \ @@ -1270,10 +1300,11 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. @@ -1315,8 +1346,8 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1391,7 +1422,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1399,8 +1431,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. 
For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1408,16 +1440,16 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = @@ -1429,9 +1461,9 @@ QHP_CUST_FILTER_ATTRS = QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1558,7 +1590,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1588,7 +1620,8 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1635,7 +1668,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1648,8 +1682,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. 
SEARCHENGINE_URL = @@ -1839,6 +1874,16 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO +# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source +# code with syntax highlighting in the LaTeX output. +# +# Note that which sources are shown also depends on other settings such as +# SOURCE_BROWSER. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_SOURCE_CODE = NO + # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See # https://en.wikipedia.org/wiki/BibTeX and \cite for more info. @@ -1919,6 +1964,16 @@ RTF_STYLESHEET_FILE = RTF_EXTENSIONS_FILE = +# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code +# with syntax highlighting in the RTF output. +# +# Note that which sources are shown also depends on other settings such as +# SOURCE_BROWSER. +# The default value is: NO. +# This tag requires that the tag GENERATE_RTF is set to YES. + +RTF_SOURCE_CODE = NO + #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- @@ -2015,6 +2070,15 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook +# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the +# program listings (including syntax highlighting and cross-referencing +# information) to the DOCBOOK output. Note that enabling this will significantly +# increase the size of the DOCBOOK output. +# The default value is: NO. +# This tag requires that the tag GENERATE_DOCBOOK is set to YES. + +DOCBOOK_PROGRAMLISTING = NO + #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- @@ -2162,7 +2226,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.06 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.10 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to @@ -2301,10 +2365,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. 
If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2494,9 +2580,11 @@ DOT_MULTI_TARGETS = NO GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc and +# plantuml temporary files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 91c3dccfdc6..fc2f72de33c 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -440,17 +440,18 @@ libcudf throws under different circumstances, see the [section on error handling ## Streams -CUDA streams are not yet exposed in external libcudf APIs. However, in order to ease the transition -to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be -implemented using asynchronous APIs on the default stream (e.g., stream 0). - -The recommended pattern for doing this is to make the definition of the external API invoke an -internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. If the -detail API also accepts a memory resource parameter, the stream parameter should be ideally placed -just *before* the memory resource. The public API will call the detail API and provide -`cudf::get_default_stream()`. The implementation should be wholly contained in the `detail` API -definition and use only asynchronous versions of CUDA APIs with the stream parameter. +libcudf is in the process of adding support for asynchronous execution using +CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs +that allocate device memory or execute a kernel should accept an +`rmm::cuda_stream_view` parameter at the end with a default value of +`cudf::get_default_stream()`. There is one exception to this rule: if the API +also accepts a memory resource parameter, the stream parameter should be placed +just *before* the memory resource. This API should then forward the call to a +corresponding `detail` API with an identical signature, except that the +`detail` API should not have a default parameter for the stream ([detail APIs +should always avoid default parameters](#default-parameters)). The +implementation should be wholly contained in the `detail` API definition and +use only asynchronous versions of CUDA APIs with the stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. 
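As an aside for reviewers, here is a minimal sketch of the public/detail stream-forwarding pattern that the revised guidance above describes. The function name `compute_something` and its header layout are hypothetical, chosen only to illustrate parameter placement and forwarding; they are not part of this patch.

```cpp
// Hypothetical illustration of the documented pattern: the public API takes the
// stream just before the memory resource, defaulting to cudf::get_default_stream(),
// and forwards to a detail API whose stream parameter has no default value.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

namespace cudf::detail {
// Detail API: same signature as the public API, but no default for the stream.
std::unique_ptr<column> compute_something(column_view const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr);
}  // namespace cudf::detail

namespace cudf {
// Public API: supplies the default stream when the caller does not pass one and
// forwards everything to the detail implementation.
std::unique_ptr<column> compute_something(
  column_view const& input,
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  return detail::compute_something(input, stream, mr);
}
}  // namespace cudf
```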
@@ -488,7 +489,7 @@ void external_function(...){ when a non-pointer value is returned from the API that is the result of an asynchronous device-to-host copy, the stream used for the copy should be synchronized before returning. However, when a column is returned, the stream should not be synchronized because doing so will break -asynchrony if and when we add an asynchronous API to libcudf. +asynchrony. **Note:** `cudaDeviceSynchronize()` should *never* be used. This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index 2d9b32362bf..c19976a956b 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -458,3 +458,69 @@ Column comparison functions in the `cudf::test::detail` namespace should **NOT** `include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to the host (`to_host`). + + +## Validating Stream Usage + +### Background + +libcudf employs a custom-built [preload library +docs](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage (the +code may be found +[`here`](https://github.com/rapidsai/cudf/blob/main/cpp/tests/utilities/identify_stream_usage.cpp)). +This library wraps every asynchronous CUDA runtime API call that accepts a stream with a check to +ensure that the passed CUDA stream is a valid one, immediately throwing an exception if an invalid +stream is detected. Running tests with this library loaded immediately triggers errors if any test +accidentally runs code on an invalid stream. + +Stream validity is determined by overloading the definition of libcudf's default stream. Normally, in +libcudf `cudf::get_default_stream` returns one of `rmm`'s default stream values (depending on +whether or not libcudf is compiled with per thread default stream enabled). In the preload library, +this function is redefined to instead return a new user-created stream managed using a +function-local static `rmm::cuda_stream`. An invalid stream in this situation is defined as any of +CUDA's default stream values (cudaStreamLegacy, cudaStreamDefault, or cudaStreamPerThread), since +any kernel that properly uses `cudf::get_default_stream` will now instead be using the custom stream +created by the preload library. + +The preload library supports two different modes, `cudf` mode and `testing` mode. The previous +paragraph describes the behavior of `cudf` mode, where `cudf::get_default_stream` is overloaded. In +`cudf` mode, the preload library ensures that all CUDA runtime APIs are being provided cudf's +default stream. This will detect oversights where, for example, a Thrust call has no stream specified, or +when one of CUDA's default stream values is explicitly specified to a kernel. However, it will not +detect cases where a stream is not correctly forwarded down the call stack, for instance if +some `detail` function that accepts a stream parameter fails to forward it along and instead +erroneously calls `cudf::get_default_stream` instead. + +In `testing` mode, the library instead overloads `cudf::test::get_default_stream`. This function +defined in the `cudf::test` namespace enables a more stringent mode of testing. In `testing` mode, +the preload library instead verifies that all CUDA runtime APIs are instead called using the test +namespace's default stream. 
This distinction is important because cudf internals never use +`cudf::test::get_default_stream`, so this stream value can only appear internally if it was provided +to a public API and forwarded properly all the way down the call stack. While `testing` mode is more +strict than `cudf` mode, it is also more intrusive. `cudf` mode can operate with no changes to the +library or the tests because the preload library overwrites the relevant APIs in place. `testing` +mode, however, can only be used to validate tests that are correctly passing +`cudf::test::get_default_stream` to public libcudf APIs. + +In addition to the preload library, the test suite also implements a [custom memory +resource](https://github.com/rapidsai/cudf/blob/main/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp) +that performs analogous stream verification when its `do_allocate` method is called. During testing +this rmm's default memory resource is set to use this adaptor for additional stream validation. + +### Usage + +When writing tests for a libcudf API, a special set of additional tests should be added to validate +the API's stream usage. These tests should be placed in the `cpp/tests/streams` directory in a file +corresponding to the header containing the tested APIs, e.g. `cpp/tests/streams/copying_test.cpp` +for all APIs declared in `cpp/include/cudf/copying.hpp`. These tests should contain a minimal +invocation of the tested API with no additional assertions since they are solely designed to check +stream usage. When adding these tests to `cpp/tests/CMakeLists.txt`, the `ConfigureTest` CMake +function should be provided the arguments `STREAM_MODE testing`. This change is sufficient for +CTest to set up the test to automatically load the preload library compiled in `testing` mode when +running the test. + +The rest of the test suite is configured to run with the preload library in `cudf` mode. As a +result, all test runs with `ctest` will always include stream validation. Since this configuration +is managed via CMake and CTest, direct execution of the test executables will not use the preload +library at all. Tests will still run and pass normally in this situation, however (with the +exception of the test of the preload library itself). diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 0922611482a..1c1952c4616 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -cmake_minimum_required(VERSION 3.23.1) +cmake_minimum_required(VERSION 3.26.4) project( basic_example @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.06) +set(CUDF_TAG branch-23.10) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index f9c49e24bf5..31a6b12a4bc 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. 
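To make the stream-usage testing workflow described in TESTING.md above more concrete, here is a hedged sketch of what such a test might look like. The file name, fixture name, and the exact `cudf::gather` signature are assumptions for illustration only; the test simply invokes a public API with `cudf::test::get_default_stream()` and asserts nothing, since the preload library loaded by CTest in `testing` mode performs the actual validation.

```cpp
// Hypothetical cpp/tests/streams/copying_test.cpp: a minimal stream-usage test.
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/copying.hpp>
#include <cudf/table/table_view.hpp>

#include <gtest/gtest.h>

class CopyingStreamTest : public cudf::test::BaseFixture {};

TEST_F(CopyingStreamTest, Gather)
{
  cudf::test::fixed_width_column_wrapper<int32_t> source({1, 2, 3, 4, 5});
  cudf::test::fixed_width_column_wrapper<int32_t> gather_map({3, 1, 0});
  // Pass the test namespace's default stream explicitly; in `testing` mode the
  // preload library checks that this stream reaches every CUDA runtime call.
  cudf::gather(cudf::table_view{{source}},
               gather_map,
               cudf::out_of_bounds_policy::DONT_CHECK,
               cudf::test::get_default_stream());
}
```

A matching entry in `cpp/tests/CMakeLists.txt` would register this file via `ConfigureTest` with the arguments `STREAM_MODE testing`, as the guidance above describes.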
-cmake_minimum_required(VERSION 3.23.1) +cmake_minimum_required(VERSION 3.26.4) project( strings_examples @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.06) +set(CUDF_TAG branch-23.10) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp index dbd3c4dbd1b..2fd9daf9339 100644 --- a/cpp/examples/strings/common.hpp +++ b/cpp/examples/strings/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,7 @@ * @brief Main example function returns redacted strings column. * * This function returns a redacted version of the input `names` column - * using the the `visibilities` column as in the following example + * using the `visibilities` column as in the following example * ``` * names visibility --> redacted * John Doe public D John diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index a956550f505..0af4c47e947 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -41,7 +41,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::column_device_view const d_visibilities, cudf::string_view redaction, char* working_memory, - cudf::offset_type const* d_offsets, + cudf::size_type const* d_offsets, cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index 84fb7cfbd5a..db0abe435b0 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -15,12 +15,12 @@ */ #pragma once +#include #include #include #include #include -#include #include #include @@ -72,24 +72,6 @@ struct alignas(8) device_data_reference { } }; -// Type trait for wrapping nullable types in a thrust::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = thrust::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - // Type used for intermediate storage in expression evaluation. template using IntermediateDataType = possibly_null_value_t; @@ -193,6 +175,13 @@ class expression_parser { */ cudf::size_type visit(operation const& expr); + /** + * @brief Visit a column name reference expression. + * + * @param expr Column name reference expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(column_name_reference const& expr); /** * @brief Internal class used to track the utilization of intermediate storage locations. * diff --git a/cpp/include/cudf/ast/detail/expression_transformer.hpp b/cpp/include/cudf/ast/detail/expression_transformer.hpp new file mode 100644 index 00000000000..a6529c338e6 --- /dev/null +++ b/cpp/include/cudf/ast/detail/expression_transformer.hpp @@ -0,0 +1,64 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf::ast::detail { +/** + * @brief Base "visitor" pattern class with the `expression` class for expression transformer. + * + * This class can be used to implement recursive traversal of AST tree, and used to validate or + * translate an AST expression. + */ +class expression_transformer { + public: + /** + * @brief Visit a literal expression. + * + * @param expr Literal expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(literal const& expr) = 0; + + /** + * @brief Visit a column reference expression. + * + * @param expr Column reference expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(column_reference const& expr) = 0; + + /** + * @brief Visit an expression expression + * + * @param expr Expression expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(operation const& expr) = 0; + + /** + * @brief Visit a column name reference expression. + * + * @param expr Column name reference expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(column_name_reference const& expr) = 0; + + virtual ~expression_transformer() {} +}; +} // namespace cudf::ast::detail diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 350ce99bcf4..ed7f2d97cef 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -33,6 +35,24 @@ namespace ast { namespace detail { +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. 
+template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + // Traits for valid operator / type combinations template constexpr bool is_valid_binary_op = cuda::std::is_invocable_v; @@ -124,6 +144,9 @@ CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, case ast_operator::IDENTITY: f.template operator()(std::forward(args)...); break; + case ast_operator::IS_NULL: + f.template operator()(std::forward(args)...); + break; case ast_operator::SIN: f.template operator()(std::forward(args)...); break; @@ -534,6 +557,17 @@ struct operator_functor { } }; +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) -> bool + { + return false; + } +}; + template <> struct operator_functor { static constexpr auto arity{1}; @@ -831,6 +865,19 @@ struct operator_functor { } }; +// IS_NULL(null) is true, IS_NULL(valid) is false +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs) -> decltype(!lhs.has_value()) + { + return !lhs.has_value(); + } +}; + // NULL_EQUAL(null, null) is true, NULL_EQUAL(null, valid) is false, and NULL_EQUAL(valid, valid) == // EQUAL(valid, valid) template <> diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 6df6ba71b4c..c5172486fa6 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -29,7 +29,8 @@ namespace ast { // Forward declaration. namespace detail { class expression_parser; -} +class expression_transformer; +} // namespace detail /** * @brief A generic expression that can be evaluated to return a value. @@ -46,6 +47,15 @@ struct expression { */ virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0; + /** + * @brief Accepts a visitor class. + * + * @param visitor The `expression_transformer` transforming this expression tree + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper accept( + detail::expression_transformer& visitor) const = 0; + /** * @brief Returns true if the expression may evaluate to null. * @@ -112,6 +122,7 @@ enum class ast_operator : int32_t { ///< LOGICAL_OR(valid, valid) // Unary operators IDENTITY, ///< Identity function + IS_NULL, ///< Check if operand is null SIN, ///< Trigonometric sine COS, ///< Trigonometric cosine TAN, ///< Trigonometric tangent @@ -300,13 +311,16 @@ class literal : public expression { [[nodiscard]] generic_scalar_device_view get_value() const { return value; } /** - * @brief Accepts a visitor class. - * - * @param visitor The `expression_parser` parsing this expression tree - * @return Index of device data reference for this instance + * @copydoc expression::accept */ cudf::size_type accept(detail::expression_parser& visitor) const override; + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -396,13 +410,16 @@ class column_reference : public expression { } /** - * @brief Accepts a visitor class. 
- * - * @param visitor The `expression_parser` parsing this expression tree - * @return Index of device data reference for this instance + * @copydoc expression::accept */ cudf::size_type accept(detail::expression_parser& visitor) const override; + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -459,13 +476,16 @@ class operation : public expression { std::vector> get_operands() const { return operands; } /** - * @brief Accepts a visitor class. - * - * @param visitor The `expression_parser` parsing this expression tree - * @return Index of device data reference for this instance + * @copydoc expression::accept */ cudf::size_type accept(detail::expression_parser& visitor) const override; + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -482,6 +502,48 @@ class operation : public expression { std::vector> const operands; }; +/** + * @brief A expression referring to data from a column in a table. + */ +class column_name_reference : public expression { + public: + /** + * @brief Construct a new column name reference object + * + * @param column_name Name of this column in the table metadata (provided when the expression is + * evaluated). + */ + column_name_reference(std::string column_name) : column_name(std::move(column_name)) {} + + /** + * @brief Get the column name. + * + * @return The name of this column reference + */ + [[nodiscard]] std::string get_column_name() const { return column_name; } + + /** + * @copydoc expression::accept + */ + cudf::size_type accept(detail::expression_parser& visitor) const override; + + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override + { + return true; + } + + private: + std::string column_name; +}; + } // namespace ast } // namespace cudf diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index c160cecbf84..a38186458c4 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -80,21 +80,17 @@ class column { * @brief Construct a new column by taking ownership of the contents of a device_uvector. * * @param other The device_uvector whose contents will be moved into the new column. - * @param null_mask Optional, column's null value indicator bitmask. May - * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`. - * @param null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on - * the first invocation of `null_count()`. + * @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0. + * @param null_count The count of null elements. 
*/ template () or cudf::is_chrono())> - column(rmm::device_uvector&& other, - rmm::device_buffer&& null_mask = {}, - size_type null_count = UNKNOWN_NULL_COUNT) + column(rmm::device_uvector&& other, rmm::device_buffer&& null_mask, size_type null_count) : _type{cudf::data_type{cudf::type_to_id()}}, _size{[&]() { CUDF_EXPECTS( other.size() <= static_cast(std::numeric_limits::max()), - "The device_uvector size exceeds the maximum size_type."); + "The device_uvector size exceeds the column size limit", + std::overflow_error); return static_cast(other.size()); }()}, _data{other.release()}, @@ -111,22 +107,19 @@ class column { * * @throws cudf::logic_error if `size < 0` * - * @param[in] dtype The element type - * @param[in] size The number of elements in the column - * @param[in] data The column's data - * @param[in] null_mask Optional, column's null value indicator bitmask. May - * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`. - * @param null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on - * the first invocation of `null_count()`. + * @param dtype The element type + * @param size The number of elements in the column + * @param data The column's data + * @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0. + * @param null_count Optional, the count of null elements. * @param children Optional, vector of child columns */ template column(data_type dtype, size_type size, B1&& data, - B2&& null_mask = {}, - size_type null_count = UNKNOWN_NULL_COUNT, + B2&& null_mask, + size_type null_count, std::vector>&& children = {}) : _type{dtype}, _size{size}, @@ -169,14 +162,9 @@ class column { /** * @brief Returns the count of null elements. * - * @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any - * point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the - * first invocation of `null_count()` will compute and store the count of null - * elements indicated by the `null_mask` (if it exists). - * * @return The number of null elements */ - [[nodiscard]] size_type null_count() const; + [[nodiscard]] size_type null_count() const { return _null_count; } /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -186,13 +174,10 @@ class column { * * @param new_null_mask New null value indicator bitmask (rvalue overload & * moved) to set the column's null value indicator mask. May be empty if - * `new_null_count` is 0 or `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. If unknown, - * specify `UNKNOWN_NULL_COUNT` to indicate that the null count should be - * computed on the first invocation of `null_count()`. + * `new_null_count` is 0. + * @param new_null_count The count of null elements. */ - void set_null_mask(rmm::device_buffer&& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT); + void set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count); /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -201,25 +186,18 @@ class column { * does not match the size of this column. * * @param new_null_mask New null value indicator bitmask (lvalue overload & copied) to set the - * column's null value indicator mask. May be empty if `new_null_count` is 0 or - * `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. 
If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on the first invocation - * of `null_count()`. + * column's null value indicator mask. May be empty if `new_null_count` is 0. + * @param new_null_count The count of null elements * @param stream The stream on which to perform the allocation and copy. Uses the default CUDF * stream if none is specified. */ void set_null_mask(rmm::device_buffer const& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT, + size_type new_null_count, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Updates the count of null elements. * - * @note `UNKNOWN_NULL_COUNT` can be specified as `new_null_count` to force - * the next invocation of `null_count()` to recompute the null count from the - * null mask. - * * @throws cudf::logic_error if `new_null_count > 0 and nullable() == false` * * @param new_null_count The new null count. @@ -321,14 +299,8 @@ class column { operator column_view() const { return this->view(); }; /** - * @brief Creates a mutable, non-owning view of the column's data and - * children. - * - * @note Creating a mutable view of a `column` invalidates the `column`'s - * `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. The user can - * either explicitly update the null count with `set_null_count()`, or - * if not, the null count will be recomputed on the next invocation of - *`null_count()`. + * @brief Creates a mutable, non-owning view of the column's data, null mask, + * and children * * @return The mutable, non-owning view */ @@ -338,13 +310,10 @@ class column { * @brief Implicit conversion operator to a `mutable_column_view`. * * This allows passing a `column` object into a function that accepts a - *`mutable_column_view`. The conversion is automatic. - - * @note Creating a mutable view of a `column` invalidates the `column`'s - * `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. For best performance, - * the user should explicitly update the null count with `set_null_count()`. - * Otherwise, the null count will be recomputed on the next invocation of - * `null_count()`. + * `mutable_column_view`. The conversion is automatic. + * + * The caller is expected to update the null count appropriately if the null mask + * is modified. * * @return Mutable, non-owning `mutable_column_view` */ @@ -357,9 +326,9 @@ class column { ///< buffer containing the column elements rmm::device_buffer _null_mask{}; ///< Bitmask used to represent null values. ///< May be empty if `null_count() == 0` - mutable cudf::size_type _null_count{UNKNOWN_NULL_COUNT}; ///< The number of null elements - std::vector> _children{}; ///< Depending on element type, child - ///< columns may contain additional data + mutable cudf::size_type _null_count{}; ///< The number of null elements + std::vector> _children{}; ///< Depending on element type, child + ///< columns may contain additional data }; /** @} */ // end of group diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index efb96cb6af6..05ef21bd750 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -275,7 +275,7 @@ class alignas(16) column_device_view_base { } /** - * @brief Returns the the specified bitmask word from the `null_mask()`. + * @brief Returns the specified bitmask word from the `null_mask()`. * * @note It is undefined behavior to call this function if `nullable() == * false`. 
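Because the `column.hpp` changes above drop `UNKNOWN_NULL_COUNT`, callers now pass the null mask and its count explicitly when constructing a column. A minimal sketch under that assumption (the helper name is hypothetical and not part of this patch):

```cpp
// Hypothetical helper showing explicit null-mask/null-count construction now
// that the constructors no longer accept an unknown null count.
#include <cudf/column/column.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

std::unique_ptr<cudf::column> make_all_valid_int32_column(cudf::size_type size)
{
  // Device storage for the column elements (left uninitialized in this sketch).
  rmm::device_uvector<int32_t> data(size, cudf::get_default_stream());

  // The mask and its null count travel together: an all-valid mask has zero nulls.
  auto null_mask = cudf::create_null_mask(size, cudf::mask_state::ALL_VALID);

  return std::make_unique<cudf::column>(std::move(data), std::move(null_mask), 0);
}
```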
@@ -442,8 +442,8 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - const auto* d_offsets = d_children[strings_column_view::offsets_column_index].data(); - const char* d_strings = d_children[strings_column_view::chars_column_index].data(); + auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); + char const* d_strings = d_children[strings_column_view::chars_column_index].data(); size_type offset = d_offsets[index]; return string_view{d_strings + offset, d_offsets[index + 1] - offset}; } diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index f0f7bf092d3..68d7df7e4eb 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -374,7 +374,7 @@ std::unique_ptr make_fixed_width_column( * @return Constructed strings column */ std::unique_ptr make_strings_column( - cudf::device_span const> strings, + cudf::device_span const> strings, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -406,7 +406,7 @@ std::unique_ptr make_strings_column( */ std::unique_ptr make_strings_column( cudf::device_span string_views, - const string_view null_placeholder, + string_view const null_placeholder, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 6d722675626..d80c720a255 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -160,14 +160,9 @@ class column_view_base { /** * @brief Returns the count of null elements * - * @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any - * point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the - * first invocation of `null_count()` will compute and store the count of null - * elements indicated by the `null_mask` (if it exists). - * * @return The count of null elements */ - [[nodiscard]] size_type null_count() const; + [[nodiscard]] size_type null_count() const { return _null_count; } /** * @brief Returns the count of null elements in the range [begin, end) @@ -263,10 +258,6 @@ class column_view_base { * * If `null_count()` is zero, `null_mask` is optional. * - * If the null count of the `null_mask` is not specified, it defaults to - * `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then - * compute the null count if `null_mask` exists. - * * If `type` is `EMPTY`, the specified `null_count` will be ignored and * `null_count()` will always return the same value as `size()` * @@ -280,17 +271,17 @@ class column_view_base { * @param type The element type * @param size The number of elements * @param data Pointer to device memory containing the column elements - * @param null_mask Optional, pointer to device memory containing the null + * @param null_mask Pointer to device memory containing the null * indicator bitmask - * @param null_count Optional, the number of null elements. - * @param offset optional, index of the first element + * @param null_count The number of null elements. 
+ * @param offset Optional, index of the first element */ column_view_base(data_type type, size_type size, void const* data, - bitmask_type const* null_mask = nullptr, - size_type null_count = UNKNOWN_NULL_COUNT, - size_type offset = 0); + bitmask_type const* null_mask, + size_type null_count, + size_type offset = 0); }; class mutable_column_view_base : public column_view_base { @@ -357,10 +348,6 @@ class column_view : public detail::column_view_base { * * If `null_count()` is zero, `null_mask` is optional. * - * If the null count of the `null_mask` is not specified, it defaults to - * `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then - * compute the null count if `null_mask` exists. - * * If `type` is `EMPTY`, the specified `null_count` will be ignored and * `null_count()` will always return the same value as `size()` * @@ -374,18 +361,18 @@ class column_view : public detail::column_view_base { * @param type The element type * @param size The number of elements * @param data Pointer to device memory containing the column elements - * @param null_mask Optional, pointer to device memory containing the null + * @param null_mask Pointer to device memory containing the null * indicator bitmask - * @param null_count Optional, the number of null elements. - * @param offset optional, index of the first element - * @param children optional, depending on the element type, child columns may + * @param null_count The number of null elements. + * @param offset Optional, index of the first element + * @param children Optional, depending on the element type, child columns may * contain additional data */ column_view(data_type type, size_type size, void const* data, - bitmask_type const* null_mask = nullptr, - size_type null_count = UNKNOWN_NULL_COUNT, + bitmask_type const* null_mask, + size_type null_count, size_type offset = 0, std::vector const& children = {}); @@ -435,8 +422,9 @@ class column_view : public detail::column_view_base { cudf::data_type{cudf::type_to_id()}, data.size(), data.data(), nullptr, 0, 0, {}) { CUDF_EXPECTS( - data.size() < static_cast(std::numeric_limits::max()), - "Data exceeds the maximum size of a column view."); + data.size() <= static_cast(std::numeric_limits::max()), + "Data exceeds the column size limit", + std::overflow_error); } /** @@ -509,12 +497,8 @@ class mutable_column_view : public detail::column_view_base { /** * @brief Construct a `mutable_column_view` from pointers to device memory for - *the elements and bitmask of the column. + * the elements and bitmask of the column. - * If the null count of the `null_mask` is not specified, it defaults to - * `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then - * compute the null count. - * * If `type` is `EMPTY`, the specified `null_count` will be ignored and * `null_count()` will always return the same value as `size()` * @@ -528,19 +512,19 @@ class mutable_column_view : public detail::column_view_base { * @param type The element type * @param size The number of elements * @param data Pointer to device memory containing the column elements - * @param null_mask Optional, pointer to device memory containing the null + * @param null_mask Pointer to device memory containing the null indicator * bitmask - * @param null_count Optional, the number of null elements. - * @param offset optional, index of the first element - * @param children optional, depending on the element type, child columns may + * @param null_count The number of null elements. 
+ * @param offset Optional, index of the first element + * @param children Optional, depending on the element type, child columns may * contain additional data */ mutable_column_view(data_type type, size_type size, void* data, - bitmask_type* null_mask = nullptr, - size_type null_count = cudf::UNKNOWN_NULL_COUNT, + bitmask_type* null_mask, + size_type null_count, size_type offset = 0, std::vector const& children = {}); diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index 2b4eee607e2..9ee55275a5e 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -34,61 +35,64 @@ namespace cudf { /** * @brief Concatenates `views[i]`'s bitmask from the bits * `[views[i].offset(), views[i].offset() + views[i].size())` for all elements - * views[i] in views into a `device_buffer` + * `views` into an `rmm::device_buffer` * - * Returns empty `device_buffer` if the column is not nullable + * Returns an empty buffer if the column is not nullable. * - * @param views host_span of column views whose bitmasks will be concatenated - * @param mr Device memory resource used for allocating the new device_buffer - * @return A `device_buffer` containing the bitmasks of all the column views in the views vector + * @param views Column views whose bitmasks will be concatenated + * @param mr Device memory resource used for allocating the returned memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Bitmasks of all the column views in the views vector */ rmm::device_buffer concatenate_masks( host_span views, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Concatenates multiple columns into a single column. + * @brief Concatenates multiple columns into a single column * * @throws cudf::logic_error If types of the input columns mismatch - * @throws std::overflow_error If the the total number of output rows exceeds cudf::size_type + * @throws std::overflow_error If the total number of output rows exceeds cudf::size_type * - * @param columns_to_concat host_span of column views to be concatenated into a single column + * @param columns_to_concat Column views to be concatenated into a single column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A single column having all the rows from the elements of `columns_to_concat` respectively * in the same order. */ std::unique_ptr concatenate( host_span columns_to_concat, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Columns of `tables_to_concat` are concatenated vertically to return a * single table * - * @ingroup column_concatenate - * - * example: - * ``` - * column_view c0; //Contains {0,1,2,3} - * column_view c1; //Contains {4,5,6,7} + * @code{.pseudo} + * column_view c0 is {0,1,2,3} + * column_view c1 is {4,5,6,7} * table_view t0{{c0, c0}}; * table_view t1{{c1, c1}}; * ... 
* auto t = concatenate({t0.view(), t1.view()}); - * column_view tc0 = (t->view()).column(0); //Contains {0,1,2,3,4,5,6,7} - * column_view tc1 = (t->view()).column(1); //Contains {0,1,2,3,4,5,6,7} - * ``` + * column_view tc0 = (t->view()).column(0) is {0,1,2,3,4,5,6,7} + * column_view tc1 = (t->view()).column(1) is {0,1,2,3,4,5,6,7} + * @endcode * * @throws cudf::logic_error If number of columns mismatch - * @throws std::overflow_error If the the total number of output rows exceeds cudf::size_type + * @throws std::overflow_error If the total number of output rows exceeds cudf::size_type * - * @param tables_to_concat host_span of table views to be concatenated into a single table + * @param tables_to_concat Table views to be concatenated into a single table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A single table having all the rows from the elements of * `tables_to_concat` respectively in the same order. */ std::unique_ptr concatenate( host_span tables_to_concat, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 62d668a98cb..bf10f1fd489 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -28,7 +28,7 @@ namespace cudf { * @addtogroup column_copy * @{ * @file - * @brief Table APIs for contiguous_split, pack, unpack, and metadadata + * @brief Table APIs for contiguous_split, pack, unpack, and metadata */ /** @@ -127,6 +127,153 @@ std::vector contiguous_split( std::vector const& splits, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +namespace detail { +struct contiguous_split_state; +}; + +/** + * @brief Perform a chunked "pack" operation of the input `table_view` using a user provided + * buffer of size `user_buffer_size`. + * + * The intent of this operation is to be used in a streamed fashion at times of GPU + * out-of-memory, where we want to minimize the number of small cudaMemcpy calls and + * tracking of all the metadata associated with cudf tables. Because of the memory constraints, + * all thrust and scratch memory allocations are using the passed-in memory resource exclusively, + * not a per-device memory resource. + * + * This class defines two methods that must be used in concert to carry out the chunked_pack: + * has_next and next. Here is an example: + * + * @code{.pseudo} + * // Create a table_view + * cudf::table_view tv = ...; + * + * // Choose a memory resource (optional). This memory resource is used for scratch/thrust temporary + * // data. In memory constrained cases, this can be used to set aside scratch memory + * // for `chunked_pack` at the beginning of a program. + * auto mr = rmm::mr::get_current_device_resource(); + * + * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be + * // occupied by this algorithm. + * // + * // Internally, the GPU unit of work is a 1MB batch. When we instantiate `cudf::chunked_pack`, + * // all the 1MB batches for the source table_view are computed up front. Additionally, + * // chunked_pack calculates the number of iterations that are required to go through all those + * // batches given a `user_buffer_size` buffer. 
The number of 1MB batches in each iteration (chunk) + * // equals the number of CUDA blocks that will be used for the main kernel launch. + * // + * std::size_t user_buffer_size = 128*1024*1024; + * + * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, mr); + * + * std::size_t host_offset = 0; + * auto host_buffer = ...; // obtain a host buffer you would like to copy to + * + * while (chunked_packer->has_next()) { + * // get a user buffer of size `user_buffer_size` + * cudf::device_span user_buffer = ...; + * std::size_t bytes_copied = chunked_packer->next(user_buffer); + * + * // buffer will hold the contents of at most `user_buffer_size` bytes + * // of the contiguously packed input `table_view`. You are now free to copy + * // this memory somewhere else, for example, to host. + * cudaMemcpyAsync( + * host_buffer.data() + host_offset, + * user_buffer.data(), + * bytes_copied, + * cudaMemcpyDefault, + * stream); + * + * host_offset += bytes_copied; + * } + * @endcode + */ +class chunked_pack { + public: + /** + * @brief Construct a `chunked_pack` class. + * + * @param input source `table_view` to pack + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be + * at least 1MB + * @param temp_mr An optional memory resource to be used for temporary and scratch allocations + * only + */ + explicit chunked_pack( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Destructor that will be implemented as default. Declared with definition here because + * contiguous_split_state is incomplete at this stage. + */ + ~chunked_pack(); + + /** + * @brief Obtain the total size of the contiguously packed `table_view`. + * + * @return total size (in bytes) of all the chunks + */ + [[nodiscard]] std::size_t get_total_contiguous_size() const; + + /** + * @brief Function to check if there are chunks left to be copied. + * + * @return true if there are chunks left to be copied, and false otherwise + */ + [[nodiscard]] bool has_next() const; + + /** + * @brief Packs the next chunk into `user_buffer`. This should be called as long as + * `has_next` returns true. If `next` is called when `has_next` is false, an exception + * is thrown. + * + * @throws cudf::logic_error If the size of `user_buffer` is different than `user_buffer_size` + * @throws cudf::logic_error If called after all chunks have been copied + * + * @param user_buffer device span target for the chunk. The size of this span must equal + * the `user_buffer_size` parameter passed at construction + * @return The number of bytes that were written to `user_buffer` (at most + * `user_buffer_size`) + */ + [[nodiscard]] std::size_t next(cudf::device_span const& user_buffer); + + /** + * @brief Build the opaque metadata for all added columns. + * + * @return A vector containing the serialized column metadata + */ + [[nodiscard]] std::unique_ptr> build_metadata() const; + + /** + * @brief Creates a `chunked_pack` instance to perform a "pack" of the `table_view` + * "input", where a buffer of `user_buffer_size` is filled with chunks of the + * overall operation. This operation can be used in cases where GPU memory is constrained. + * + * The memory resource (`temp_mr`) could be a special memory resource to be used in + * situations when GPU memory is low and we want scratch and temporary allocations to + * happen from a small reserved pool of memory. 
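For the memory-constrained scenario described here, a minimal sketch of passing a dedicated scratch resource to `create` might look as follows; the pool size, the buffer size, and the use of RMM's `pool_memory_resource` are illustrative assumptions, not part of this patch:

@code{.cpp}
#include <cudf/contiguous_split.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void pack_with_reserved_scratch(cudf::table_view const& tv)
{
  // Reserve a small pool up front so chunked_pack's scratch and temporary
  // allocations do not compete with the rest of the application for device
  // memory (the 64 MB size is hypothetical).
  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> scratch_mr{
    rmm::mr::get_current_device_resource(), 64 * 1024 * 1024};

  // 128 MB destination buffer per chunk; scratch_mr is used only for temporaries.
  auto packer = cudf::chunked_pack::create(tv, 128 * 1024 * 1024, &scratch_mr);

  // Drive packer->has_next() / packer->next(...) exactly as in the example above.
}
@endcode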
Note that it defaults to the regular cuDF + * per-device resource. + * + * @throws cudf::logic_error When user_buffer_size is less than 1MB + * + * @param input source `table_view` to pack + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be + * at least 1MB + * @param temp_mr RMM memory resource to be used for temporary and scratch allocations only + * @return a unique_ptr of chunked_pack + */ + [[nodiscard]] static std::unique_ptr create( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); + + private: + // internal state of contiguous split + std::unique_ptr state; +}; + /** * @brief Deep-copy a `table_view` into a serialized contiguous memory format. * @@ -147,7 +294,7 @@ packed_columns pack(cudf::table_view const& input, * * The metadata from the `table_view` is copied into a host vector of bytes which can be used to * construct a `packed_columns` or `packed_table` structure. The caller is responsible for - * guaranteeing that that all of the columns in the table point into `contiguous_buffer`. + * guaranteeing that all of the columns in the table point into `contiguous_buffer`. * * @param table View of the table to pack * @param contiguous_buffer A contiguous buffer of device memory which contains the data referenced diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 921ef5f65f1..63680473c14 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -68,22 +68,24 @@ enum class out_of_bounds_policy : bool { * * @throws cudf::logic_error if gather_map contains null values. * - * @param[in] source_table The input columns whose rows will be gathered - * @param[in] gather_map View into a non-nullable column of integral indices that maps the + * @param source_table The input columns whose rows will be gathered + * @param gather_map View into a non-nullable column of integral indices that maps the * rows in the source columns to rows in the destination columns. - * @param[in] bounds_policy Policy to apply to account for possible out-of-bounds indices + * @param bounds_policy Policy to apply to account for possible out-of-bounds indices * `DONT_CHECK` skips all bounds checking for gather map values. `NULLIFY` coerces rows that * corresponds to out-of-bounds indices in the gather map to be null elements. Callers should * use `DONT_CHECK` when they are certain that the gather_map contains only valid indices for * better performance. If `policy` is set to `DONT_CHECK` and there are out-of-bounds indices * in the gather map, the behavior is undefined. Defaults to `DONT_CHECK`. - * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of the gather */ std::unique_ptr
gather( table_view const& source_table, column_view const& gather_map, out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -97,11 +99,13 @@ std::unique_ptr
gather( * ``` * * @param source_table Table that will be reversed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Reversed table */ std::unique_ptr
reverse( table_view const& source_table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,11 +119,13 @@ std::unique_ptr
reverse( * ``` * * @param source_column Column that will be reversed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Reversed column */ std::unique_ptr reverse( column_view const& source_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -153,6 +159,7 @@ std::unique_ptr reverse( * to or less than the number of elements in the source columns. * @param target The set of columns into which values from the source_table * are to be scattered + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ @@ -160,6 +167,7 @@ std::unique_ptr
scatter( table_view const& source, column_view const& scatter_map, table_view const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,13 +197,15 @@ std::unique_ptr
scatter( * the rows in the target table to be replaced by source. * @param target The set of columns into which values from the source_table * are to be scattered + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ std::unique_ptr
scatter( - std::vector> const& source, + std::vector> const& source, column_view const& indices, table_view const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -228,15 +238,20 @@ std::unique_ptr empty_like(scalar const& input); * * Supports only fixed-width types. * - * @param[in] input Immutable view of input column to emulate - * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN - * @param[in] mr Device memory resource used to allocate the returned column's device memory + * If the `mask_alloc` allocates a validity mask that mask is also uninitialized + * and the validity bits and the null count should be set by the caller. + * + * @param input Immutable view of input column to emulate + * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return A column with sufficient uninitialized capacity to hold the same * number of elements as `input` of the same type as `input.type()` */ std::unique_ptr allocate_like( column_view const& input, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -244,10 +259,14 @@ std::unique_ptr allocate_like( * * Supports only fixed-width types. * - * @param[in] input Immutable view of input column to emulate - * @param[in] size The desired number of elements that the new column should have capacity for - * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN - * @param[in] mr Device memory resource used to allocate the returned column's device memory + * If the `mask_alloc` allocates a validity mask that mask is also uninitialized + * and the validity bits and the null count should be set by the caller. + * + * @param input Immutable view of input column to emulate + * @param size The desired number of elements that the new column should have capacity for + * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return A column with sufficient uninitialized capacity to hold the specified number of elements * as `input` of the same type as `input.type()` */ @@ -255,6 +274,7 @@ std::unique_ptr allocate_like( column_view const& input, size_type size, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -299,12 +319,14 @@ std::unique_ptr
empty_like(table_view const& input_table); * @param source_end The index of the last element in the source range * (exclusive) * @param target_begin The starting index of the target range (inclusive) + * @param stream CUDA stream used for device memory operations and kernel launches */ void copy_range_in_place(column_view const& source, mutable_column_view& target, size_type source_begin, size_type source_end, - size_type target_begin); + size_type target_begin, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Copies a range of elements out-of-place from one column to another. @@ -332,6 +354,7 @@ void copy_range_in_place(column_view const& source, * @param source_end The index of the last element in the source range * (exclusive) * @param target_begin The starting index of the target range (inclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result target column */ @@ -341,6 +364,7 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -372,6 +396,7 @@ std::unique_ptr copy_range( * @param input Column to be shifted * @param offset The offset by which to shift the input * @param fill_value Fill value for indeterminable outputs + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * * @throw cudf::logic_error if @p input dtype is neither fixed-width nor string type @@ -383,6 +408,7 @@ std::unique_ptr shift( column_view const& input, size_type offset, scalar const& fill_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -413,14 +439,19 @@ std::unique_ptr shift( * * @param input View of column to slice * @param indices Indices used to take slices of `input` + * @param stream CUDA stream used for device memory operations and kernel launches * @return Vector of views of `input` indicated by the ranges in `indices` */ -std::vector slice(column_view const& input, host_span indices); +std::vector slice(column_view const& input, + host_span indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_slice - * @copydoc cudf::slice(column_view const&, host_span) + * @copydoc cudf::slice(column_view const&, host_span, rmm::cuda_stream_view) */ -std::vector slice(column_view const& input, std::initializer_list indices); +std::vector slice(column_view const& input, + std::initializer_list indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Slices a `table_view` into a set of `table_view`s according to a set of indices. 
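As a minimal sketch of the new stream-aware slicing overloads (the input column, the split points, and the stream are illustrative, not part of this patch), slicing a column into two views might look like:

@code{.cpp}
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <vector>

std::vector<cudf::column_view> first_two_and_rest(cudf::column_view const& col,
                                                  rmm::cuda_stream_view stream)
{
  // Indices come in {begin, end} pairs: rows [0, 2) and rows [2, col.size()).
  // All work is enqueued on the caller-provided stream.
  return cudf::slice(col, {0, 2, 2, col.size()}, stream);
}
@endcode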
@@ -452,14 +483,19 @@ std::vector slice(column_view const& input, std::initializer_list slice(table_view const& input, host_span indices); +std::vector slice(table_view const& input, + host_span indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_slice - * @copydoc cudf::slice(table_view const&, host_span) + * @copydoc cudf::slice(table_view const&, host_span, rmm::cuda_stream_view stream) */ -std::vector slice(table_view const& input, std::initializer_list indices); +std::vector slice(table_view const& input, + std::initializer_list indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Splits a `column_view` into a set of `column_view`s according to a set of indices @@ -491,14 +527,19 @@ std::vector slice(table_view const& input, std::initializer_list split(column_view const& input, host_span splits); +std::vector split(column_view const& input, + host_span splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_split - * @copydoc cudf::split(column_view const&, host_span) + * @copydoc cudf::split(column_view const&, host_span, rmm::cuda_stream_view) */ -std::vector split(column_view const& input, std::initializer_list splits); +std::vector split(column_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Splits a `table_view` into a set of `table_view`s according to a set of indices @@ -532,14 +573,19 @@ std::vector split(column_view const& input, std::initializer_list split(table_view const& input, host_span splits); +std::vector split(table_view const& input, + host_span splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_split - * @copydoc cudf::split(table_view const&, host_span) + * @copydoc cudf::split(table_view const&, host_span, rmm::cuda_stream_view) */ -std::vector split(table_view const& input, std::initializer_list splits); +std::vector split(table_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Returns a new column, where each element is selected from either @p lhs or @@ -552,11 +598,12 @@ std::vector split(table_view const& input, std::initializer_list copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -576,11 +624,12 @@ std::unique_ptr copy_if_else( * @throws cudf::logic_error if lhs and rhs are not of the same type * @throws cudf::logic_error if boolean mask is not of type bool * @throws cudf::logic_error if boolean mask is not of the same length as rhs - * @param[in] lhs left-hand scalar - * @param[in] rhs right-hand column_view - * @param[in] boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" + * @param lhs left-hand scalar + * @param rhs right-hand column_view + * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" * boolean for each element. Null element represents false. 
- * @param[in] mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns new column with the selected elements */ @@ -588,6 +637,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -600,11 +650,12 @@ std::unique_ptr copy_if_else( * @throws cudf::logic_error if lhs and rhs are not of the same type * @throws cudf::logic_error if boolean mask is not of type bool * @throws cudf::logic_error if boolean mask is not of the same length as lhs - * @param[in] lhs left-hand column_view - * @param[in] rhs right-hand scalar - * @param[in] boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" + * @param lhs left-hand column_view + * @param rhs right-hand scalar + * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" * boolean for each element. Null element represents false. - * @param[in] mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns new column with the selected elements */ @@ -612,6 +663,7 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -622,11 +674,12 @@ std::unique_ptr copy_if_else( * rule: `output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs : rhs` * * @throws cudf::logic_error if boolean mask is not of type bool - * @param[in] lhs left-hand scalar - * @param[in] rhs right-hand scalar - * @param[in] boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" + * @param lhs left-hand scalar + * @param rhs right-hand scalar + * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" * boolean for each element. null element represents false. 
- * @param[in] mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns new column with the selected elements */ @@ -634,6 +687,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -665,10 +719,11 @@ std::unique_ptr copy_if_else( * @throws cudf::logic_error if boolean_mask.size() != target.num_rows() * @throws cudf::logic_error if number of `true` in `boolean_mask` > input.num_rows() * - * @param[in] input table_view (set of dense columns) to scatter - * @param[in] target table_view to modify with scattered values from `input` - * @param[in] boolean_mask column_view which acts as boolean mask - * @param[in] mr Device memory resource used to allocate device memory of the returned table + * @param input table_view (set of dense columns) to scatter + * @param target table_view to modify with scattered values from `input` + * @param boolean_mask column_view which acts as boolean mask + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned table * * @returns Returns a table by scattering `input` into `target` as per `boolean_mask` */ @@ -676,6 +731,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& input, table_view const& target, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -702,17 +758,19 @@ std::unique_ptr
boolean_mask_scatter( * @throws cudf::logic_error if boolean_mask.type() != bool * @throws cudf::logic_error if boolean_mask.size() != target.size() * - * @param[in] input scalars to scatter - * @param[in] target table_view to modify with scattered values from `input` - * @param[in] boolean_mask column_view which acts as boolean mask - * @param[in] mr Device memory resource used to allocate device memory of the returned table + * @param input scalars to scatter + * @param target table_view to modify with scattered values from `input` + * @param boolean_mask column_view which acts as boolean mask + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned table * * @returns Returns a table by scattering `input` into `target` as per `boolean_mask` */ std::unique_ptr
boolean_mask_scatter( - std::vector> const& input, + std::vector> const& input, table_view const& target, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -725,12 +783,14 @@ std::unique_ptr
boolean_mask_scatter( * * @param input Column view to get the element from * @param index Index into `input` to get the element at + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Scalar containing the single value */ std::unique_ptr get_element( column_view const& input, size_type index, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -764,6 +824,7 @@ enum class sample_with_replacement : bool { * @param n non-negative number of samples expected from `input` * @param replacement Allow or disallow sampling of the same row more than once * @param seed Seed value to initiate random number generator + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @return Table containing samples from `input` @@ -773,6 +834,7 @@ std::unique_ptr
sample( size_type const n, sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -788,10 +850,12 @@ std::unique_ptr
sample( * * @param input The column which is (and whose descendants are) to be checked for * non-empty null rows. + * @param stream CUDA stream used for device memory operations and kernel launches * @return true If either the column or its descendants have non-empty null rows * @return false If neither the column or its descendants have non-empty null rows */ -bool has_nonempty_nulls(column_view const& input); +bool has_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Approximates if a column or its descendants *may* have non-empty null elements @@ -881,11 +945,13 @@ bool may_have_nonempty_nulls(column_view const& input); * @endcode * * @param input The column whose null rows are to be checked and purged + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A new column with equivalent contents to `input`, but with null rows purged */ std::unique_ptr purge_nonempty_nulls( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index fb04336871f..44736ca0762 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,7 @@ std::unique_ptr extract_day( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any datetime type and returns an int16_t + * @brief Extracts a weekday from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index 89bab94faaf..41eec156c47 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -36,7 +36,7 @@ struct pair_column_aggregation_equal_to { struct pair_column_aggregation_hash { size_t operator()(std::pair const& key) const { - return hash_combine(shallow_hash(key.first), key.second.do_hash()); + return cudf::hashing::detail::hash_combine(shallow_hash(key.first), key.second.do_hash()); } }; @@ -45,7 +45,7 @@ class result_cache { result_cache() = delete; ~result_cache() = default; result_cache(result_cache const&) = delete; - result_cache& operator=(const result_cache& other) = delete; + result_cache& operator=(result_cache const& other) = delete; result_cache(size_t num_columns) : _cache(num_columns) {} diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate_masks.hpp similarity index 76% rename from cpp/include/cudf/detail/concatenate.cuh rename to cpp/include/cudf/detail/concatenate_masks.hpp index 51bcb1afa1f..e7086ea17a5 100644 --- a/cpp/include/cudf/detail/concatenate.cuh +++ b/cpp/include/cudf/detail/concatenate_masks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,14 +17,11 @@ #include #include -#include -#include -#include #include #include - -#include +#include +#include namespace cudf { //! Inner interfaces and implementations @@ -39,12 +36,13 @@ namespace detail { * @param dest_mask The output buffer to copy null masks into * @param output_size The total number of null masks bits that are being copied * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of nulls */ -void concatenate_masks(device_span d_views, - device_span d_offsets, - bitmask_type* dest_mask, - size_type output_size, - rmm::cuda_stream_view stream); +size_type concatenate_masks(device_span d_views, + device_span d_offsets, + bitmask_type* dest_mask, + size_type output_size, + rmm::cuda_stream_view stream); /** * @brief Concatenates `views[i]`'s bitmask from the bits @@ -54,10 +52,11 @@ void concatenate_masks(device_span d_views, * @param views Column views whose bitmasks will be concatenated * @param dest_mask The output buffer to copy null masks into * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of nulls */ -void concatenate_masks(host_span views, - bitmask_type* dest_mask, - rmm::cuda_stream_view stream); +size_type concatenate_masks(host_span views, + bitmask_type* dest_mask, + rmm::cuda_stream_view stream); /** * @copydoc cudf::concatenate_masks(host_span, rmm::mr::device_memory_resource*) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index cb3e20b36f2..1dd91dcd865 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +43,8 @@ #include +#include + #include namespace cudf { @@ -126,7 +127,7 @@ __launch_bounds__(block_size) __global__ cudf::size_type tmp_block_sum = 0; // get output location using a scan of the mask result - const cudf::size_type local_index = block_scan_mask(mask_true, tmp_block_sum); + cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); block_sum += tmp_block_sum; if (has_validity) { @@ -141,7 +142,7 @@ __launch_bounds__(block_size) __global__ // scatter validity mask to shared memory if (has_validity and input_view.is_valid(tid)) { // determine aligned offset for this warp's output - const cudf::size_type aligned_offset = block_offset % cudf::detail::warp_size; + cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; temp_valids[local_index + aligned_offset] = true; } } @@ -161,10 +162,10 @@ __launch_bounds__(block_size) __global__ constexpr int num_warps = block_size / cudf::detail::warp_size; // account for partial blocks with non-warp-aligned offsets - const int last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - const int last_warp = min(num_warps, last_index / cudf::detail::warp_size); - const int wid = threadIdx.x / cudf::detail::warp_size; - const int lane = threadIdx.x % cudf::detail::warp_size; + int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; + int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); + int const wid = threadIdx.x / cudf::detail::warp_size; + int const lane = threadIdx.x % cudf::detail::warp_size; cudf::size_type tmp_warp_valid_counts{0}; @@ -181,7 +182,9 @@ 
__launch_bounds__(block_size) __global__ if (wid > 0 && wid < last_warp) output_valid[valid_index] = valid_warp; else { - atomicOr(&output_valid[valid_index], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } @@ -190,7 +193,9 @@ __launch_bounds__(block_size) __global__ uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); if (lane == 0 && valid_warp != 0) { tmp_warp_valid_counts += __popc(valid_warp); - atomicOr(&output_valid[valid_index + num_warps], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index + num_warps]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } } @@ -206,7 +211,8 @@ __launch_bounds__(block_size) __global__ cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); if (threadIdx.x == 0) { // one thread computes and adds to null count - atomicAdd(output_null_count, block_sum - block_valid_count); + cuda::atomic_ref ref{*output_null_count}; + ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); } } diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 083b12edbf8..04ad1f20196 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,9 +44,9 @@ __launch_bounds__(block_size) __global__ mutable_column_device_view out, size_type* __restrict__ const valid_count) { - const size_type tid = threadIdx.x + blockIdx.x * block_size; - const int warp_id = tid / warp_size; - const size_type warps_per_grid = gridDim.x * block_size / warp_size; + size_type const tid = threadIdx.x + blockIdx.x * block_size; + int const warp_id = tid / warp_size; + size_type const warps_per_grid = gridDim.x * block_size / warp_size; // begin/end indices for the column data size_type begin = 0; @@ -59,7 +59,7 @@ __launch_bounds__(block_size) __global__ // lane id within the current warp constexpr size_type leader_lane{0}; - const int lane_id = threadIdx.x % warp_size; + int const lane_id = threadIdx.x % warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 0d5aa509e08..16e4e7a1297 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -54,17 +54,17 @@ __global__ void copy_range_kernel(SourceValueIterator source_value_begin, "copy_range_kernel assumes bitmask element size in bits == warp size"); constexpr cudf::size_type leader_lane{0}; - const int lane_id = threadIdx.x % warp_size; + int const lane_id = threadIdx.x % warp_size; - const cudf::size_type tid = threadIdx.x + blockIdx.x * blockDim.x; - const int warp_id = tid / warp_size; + cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; + int const warp_id = tid / warp_size; - const cudf::size_type offset = target.offset(); - const cudf::size_type begin_mask_idx = cudf::word_index(offset + target_begin); - const cudf::size_type end_mask_idx = cudf::word_index(offset + target_end); + cudf::size_type const offset = target.offset(); + cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin); + cudf::size_type const end_mask_idx = cudf::word_index(offset + target_end); cudf::size_type mask_idx = begin_mask_idx + warp_id; - const cudf::size_type masks_per_grid = gridDim.x * blockDim.x / warp_size; + cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size; cudf::size_type target_offset = begin_mask_idx * 
warp_size - (offset + target_begin); cudf::size_type source_idx = tid + target_offset; @@ -79,10 +79,10 @@ __global__ void copy_range_kernel(SourceValueIterator source_value_begin, if (in_range) target.element(index) = *(source_value_begin + source_idx); if (has_validity) { // update bitmask - const bool valid = in_range && *(source_validity_begin + source_idx); - const int active_mask = __ballot_sync(0xFFFF'FFFFu, in_range); - const int valid_mask = __ballot_sync(0xFFFF'FFFFu, valid); - const int warp_mask = active_mask & valid_mask; + bool const valid = in_range && *(source_validity_begin + source_idx); + int const active_mask = __ballot_sync(0xFFFF'FFFFu, in_range); + int const valid_mask = __ballot_sync(0xFFFF'FFFFu, valid); + int const warp_mask = active_mask & valid_mask; cudf::bitmask_type old_mask = target.get_mask_word(mask_idx); if (lane_id == leader_lane) { @@ -154,11 +154,6 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - // TODO: if null_count is UNKNOWN_NULL_COUNT, no need to update null - // count (if null_count is UNKNOWN_NULL_COUNT, invoking null_count() - // will scan the entire bitmask array, and this can be surprising - // in performance if the copy range is small and the column size is - // large). rmm::device_scalar null_count(target.null_count(), stream); auto kernel = diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 3146005ca49..0ab9da0dbd0 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -582,7 +582,7 @@ struct indexalator_factory { }; /** - * @brief An index accessor that returns an index value if corresponding validity flag is true. + * @brief An index accessor that returns an index value if the scalar's validity flag is true. * * This is suitable as an `optional_iterator`. */ @@ -605,7 +605,7 @@ struct indexalator_factory { }; /** - * @brief Create an index iterator with a nullable index accessor. + * @brief Create an index iterator with an optional index accessor. */ static auto make_input_optional_iterator(column_view const& col) { @@ -613,7 +613,7 @@ struct indexalator_factory { } /** - * @brief Create an index iterator with a nullable index accessor for a scalar. + * @brief Create an index iterator with an optional index accessor for a scalar. */ static auto make_input_optional_iterator(scalar const& input) { diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index c424c20d7c7..6fcf10aef57 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include #include @@ -86,7 +86,6 @@ struct hash_join { * @brief Constructor that internally builds the hash table based on the given `build` table. * * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. * * @param build The build table, from which the hash table is built. * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or @@ -177,7 +176,6 @@ struct hash_join { * @copydoc cudf::detail::hash_join::probe_join_indices * * @throw cudf::logic_error if probe table is empty. - * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`. * @throw cudf::logic_error if the number of columns in build table and probe table do not match. 
* @throw cudf::logic_error if the column data types in build table and probe table do not match. */ diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 3ff3bb4cf3c..78cd3d7bcb7 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -66,9 +66,11 @@ __global__ void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - constexpr auto const word_size{detail::size_in_bits()}; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const last_bit_index = source_size_bits - 1; + auto const last_word_index = cudf::word_index(last_bit_index); + size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); @@ -86,20 +88,19 @@ __global__ void offset_bitmask_binop(Binop op, source_begin_bits[i] + source_size_bits)); } + if (destination_word_index == last_word_index) { + // mask out any bits not part of this word + auto const num_bits_in_last_word = intra_word_index(last_bit_index); + if (num_bits_in_last_word < + static_cast(detail::size_in_bits() - 1)) { + destination_word &= set_least_significant_bits(num_bits_in_last_word + 1); + } + } + destination[destination_word_index] = destination_word; thread_count += __popc(destination_word); } - // Subtract any slack bits from the last word - if (tid == 0) { - size_type const last_bit_index = source_size_bits - 1; - size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1; - if (num_slack_bits > 0) { - size_type const word_index = cudf::word_index(last_bit_index); - thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); - } - } - using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count); @@ -262,7 +263,7 @@ __global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bi */ struct bit_to_word_index { bit_to_word_index(bool inclusive) : inclusive(inclusive) {} - __device__ inline size_type operator()(const size_type& bit_index) const + __device__ inline size_type operator()(size_type const& bit_index) const { return word_index(bit_index) + ((inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); } @@ -378,13 +379,13 @@ size_type validate_segmented_indices(IndexIterator indices_begin, IndexIterator } struct index_alternator { - __device__ inline size_type operator()(const size_type& i) const + __device__ inline size_type operator()(size_type const& i) const { return *(d_indices + 2 * i + (is_end ? 1 : 0)); } bool const is_end = false; - const size_type* d_indices; + size_type const* d_indices; }; /** diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 7f1b15893c5..8c10bbe416f 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -141,20 +141,9 @@ cudf::size_type valid_count(bitmask_type const* bitmask, rmm::cuda_stream_view stream); /** - * @brief Given a validity bitmask, counts the number of null elements (unset bits) - * in the range `[start, stop)`. - * - * If `bitmask == nullptr`, all elements are assumed to be valid and the - * function returns ``. - * - * @throws cudf::logic_error if `start > stop` - * @throws cudf::logic_error if `start < 0` + * @copydoc null_count(bitmask_type const* bitmask, size_type start, size_type stop) * - * @param[in] bitmask Validity bitmask residing in device memory. 
- * @param[in] start Index of the first bit to count (inclusive). - * @param[in] stop Index of the last bit to count (exclusive). - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return The number of null elements in the specified range. + * @param stream Stream view on which to allocate resources and queue execution. */ cudf::size_type null_count(bitmask_type const* bitmask, size_type start, diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index 8d7323cb88e..4b840724034 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -1167,7 +1167,7 @@ class registered_message { * Registers `msg` with NVTX and associates a handle with the registered * message. * - * A particular message should should only be registered once and the handle + * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message @@ -1183,7 +1183,7 @@ class registered_message { * Registers `msg` with NVTX and associates a handle with the registered * message. * - * A particular message should should only be registered once and the handle + * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message @@ -1196,7 +1196,7 @@ class registered_message { * Registers `msg` with NVTX and associates a handle with the registered * message. * - * A particular message should should only be registered once and the handle + * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 39ae4fe1944..94c795f31b2 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -106,7 +106,7 @@ std::unique_ptr
scatter(table_view const& source, * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ -std::unique_ptr
scatter(std::vector> const& source, +std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, rmm::cuda_stream_view stream, @@ -136,7 +136,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& source, * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
boolean_mask_scatter( - std::vector> const& source, + std::vector> const& source, table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 0017ddb305d..155b1ce5691 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -244,7 +244,7 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_i * auto const bytes = cudf::detail::sizes_to_offsets( * d_offsets, d_offsets + strings_count + 1, d_offsets, stream); * CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - * "Size of output exceeds column size limit", std::overflow_error); + * "Size of output exceeds the column size limit", std::overflow_error); * @endcode * * @tparam SizesIterator Iterator type for input of the scan using addition operation @@ -303,9 +303,9 @@ std::pair, size_type> make_offsets_child_column( { auto count = static_cast(std::distance(begin, end)); auto offsets_column = make_numeric_column( - data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); + auto d_offsets = offsets_view.template data(); // The number of offsets is count+1 so to build the offsets from the sizes // using exclusive-scan technically requires count+1 input values even though @@ -319,7 +319,7 @@ std::pair, size_type> make_offsets_child_column( auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + "Size of output exceeds the column size limit", std::overflow_error); offsets_column->set_null_count(0); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e0fc7b71cd9..5476000fc29 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -86,24 +86,9 @@ std::unique_ptr
distinct(table_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Create a new table without duplicate rows. + * @copydoc cudf::stable_distinct * - * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. The row order is guaranteed to be preserved as in the input. - * - * If there are duplicate rows, which row to be copied depends on the specified value of the `keep` - * parameter. - * - * This API produces exactly the same set of output rows as `cudf::distinct`. - * - * @param input The input table - * @param keys Vector of indices indicating key columns in the `input` table - * @param keep Copy any, first, last, or none of the found duplicates - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN elements should be considered as equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table - * @return A table containing the resulting distinct rows + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index d9fb0efed45..b529d4a2c53 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -27,9 +27,10 @@ namespace tdigest { namespace detail { /** - * @brief Generate a tdigest column from a grouped set of numeric input values. + * @brief Generate a tdigest column from a grouped, sorted set of numeric input values. * - * The tdigest column produced is of the following structure: + * The input is expected to be sorted in ascending order within each group, with + * nulls at the end. * * struct { * // centroids for the digest @@ -166,96 +167,6 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief Generate a tdigest column from a grouped, sorted set of numeric input values. - * - * The input is expected to be sorted in ascending order within each group, with - * nulls at the end. - * - * The tdigest column produced is of the following structure: - ** struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). - * - * @param values Grouped (and sorted) values to merge. - * @param group_offsets Offsets of groups' starting points within @p values. - * @param group_labels 0-based ID of group that the corresponding value belongs to - * @param group_valid_counts Per-group counts of valid elements. - * @param num_groups Number of groups. - * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher - * values result in a larger, more precise tdigest. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns tdigest column, with 1 tdigest per row - */ -std::unique_ptr group_tdigest(column_view const& values, - cudf::device_span group_offsets, - cudf::device_span group_labels, - cudf::device_span group_valid_counts, - size_type num_groups, - int max_centroids, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -/** - * @brief Merges tdigests within the same group to generate a new tdigest. - * - * The tdigest column produced is of the following structure: - * - * struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). - * - * @param values Grouped tdigests to merge. - * @param group_offsets Offsets of groups' starting points within @p values. 
- * @param group_labels 0-based ID of group that the corresponding value belongs to - * @param num_groups Number of groups. - * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher - * values result in a larger, more precise tdigest. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns tdigest column, with 1 tdigest per row - */ -std::unique_ptr group_merge_tdigest(column_view const& values, - cudf::device_span group_offsets, - cudf::device_span group_labels, - size_type num_groups, - int max_centroids, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Generate a tdigest scalar from a set of numeric input values. * diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 5b64f61f11a..215ad50aed6 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -41,8 +41,8 @@ std::unique_ptr transform(column_view const& input, * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr compute_column(table_view const table, - ast::operation const& expr, +std::unique_ptr compute_column(table_view const& table, + ast::expression const& expr, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index cdbc26701d1..264302df0e9 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,8 +41,8 @@ static constexpr size_type warp_size{32}; */ class grid_1d { public: - const int num_threads_per_block; - const int num_blocks; + int const num_threads_per_block; + int const num_blocks; /** * @param overall_num_elements The number of elements the kernel needs to * handle/process, in its main, one-dimensional/linear input (e.g. one or more @@ -65,6 +65,56 @@ class grid_1d { CUDF_EXPECTS(num_threads_per_block > 0, "num_threads_per_block must be > 0"); CUDF_EXPECTS(num_blocks > 0, "num_blocks must be > 0"); } + + /** + * @brief Returns the global thread index in a 1D grid. + * + * The returned index is unique across the entire grid. + * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + static constexpr thread_index_type global_thread_id(thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) + { + return thread_id + block_id * num_threads_per_block; + } + + /** + * @brief Returns the global thread index of the current thread in a 1D grid. + * + * @return thread_index_type The global thread index + */ + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); + } + + /** + * @brief Returns the stride of a 1D grid. + * + * The returned stride is the total number of threads in the grid. 
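A minimal sketch of how `global_thread_id()` and `grid_stride()` are meant to be used together in a grid-stride loop (the kernel below is illustrative, not part of this patch):

@code{.cpp}
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>

__global__ void increment_kernel(int* data, cudf::size_type size)
{
  // Each thread starts at its global index and advances by the total number
  // of threads in the grid until all elements have been visited.
  auto const stride = cudf::detail::grid_1d::grid_stride();
  for (auto idx = cudf::detail::grid_1d::global_thread_id(); idx < size; idx += stride) {
    data[idx] += 1;
  }
}
@endcode

A launch would typically size the grid with `cudf::detail::grid_1d{size, 256}` and read back its `num_blocks` and `num_threads_per_block` members.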
+ * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) + { + return num_threads_per_block * num_blocks_per_grid; + } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @return thread_index_type The number of threads in the grid. + */ + static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } }; /** @@ -106,6 +156,10 @@ __device__ T single_lane_block_sum_reduce(T lane_value) lane_value = (lane_id < warps_per_block) ? lane_values[lane_id] : T{0}; result = cub::WarpReduce(temp).Sum(lane_value); } + // Shared memory has block scope, so sync here to ensure no data + // races between successive calls to this function in the same + // kernel. + __syncthreads(); return result; } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index c1fc96d6f43..c56e88f07a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ * cudf::duration_us, cudf::duration_ns and bool * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, * `atomicCAS`. - * `atomicAnd`, `atomicOr`, `atomicXor` are also supported for integer data types. * Also provides `cudf::genericAtomicOperation` which performs atomic operation * with the given binary operator. */ @@ -161,7 +160,6 @@ struct genericAtomicOperationImpl { // specialized functions for operators // `atomicAdd` supports int32, float, double (signed int64 is not supported.) 
// `atomicMin`, `atomicMax` support int32_t, int64_t -// `atomicAnd`, `atomicOr`, `atomicXor` support int32_t, int64_t template <> struct genericAtomicOperationImpl { using T = float; @@ -252,63 +250,6 @@ struct genericAtomicOperationImpl { return ret; } }; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - return atomicAnd(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - return atomicOr(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - return atomicXor(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -598,66 +539,3 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } - -/** - * @brief Overloads for `atomicAnd` - * reads the `old` located at the `address` in global or shared memory, - * computes (old & val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicAnd` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceAnd{}); -} - -/** - * @brief Overloads for `atomicOr` - * reads the `old` located at the `address` in global or shared memory, - * computes (old | val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicOr` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. 
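With the cudf-provided atomicAnd/atomicOr/atomicXor overloads removed here, device code that needs bitwise atomics on natively supported widths can fall back to the CUDA built-ins directly. A minimal illustrative sketch, not part of this change:

#include <cudf/types.hpp>

// Illustrative only: CUDA's native atomicOr covers unsigned 32-bit (and 64-bit) words,
// which is the common case for bitmask updates.
__global__ void set_bits(unsigned int* bitmask, cudf::size_type word_index, unsigned int bits)
{
  atomicOr(bitmask + word_index, bits);
}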
- * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceOr{}); -} - -/** - * @brief Overloads for `atomicXor` - * reads the `old` located at the `address` in global or shared memory, - * computes (old ^ val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicXor` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceXor{}); -} diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index f6d9d32b398..46f424e051b 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) */ struct DeviceSum { template ()>* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -93,13 +93,13 @@ struct DeviceSum { */ struct DeviceCount { template ()>* = nullptr> - CUDF_HOST_DEVICE inline T operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(T const& lhs, T const& rhs) { return T{DeviceCount{}(lhs.time_since_epoch(), rhs.time_since_epoch())}; } template ()>* = nullptr> - CUDF_HOST_DEVICE inline T operator()(const T&, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(T const&, T const& rhs) { return rhs + T{1}; } @@ -116,7 +116,7 @@ struct DeviceCount { */ struct DeviceMin { template - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(cudf::detail::min(lhs, rhs)) { return numeric::detail::min(lhs, rhs); @@ -164,7 +164,7 @@ struct DeviceMin { */ struct DeviceMax { template - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(cudf::detail::max(lhs, rhs)) { return numeric::detail::max(lhs, rhs); @@ -211,7 +211,7 @@ struct DeviceMax { */ struct DeviceProduct { template ()>* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -230,44 +230,11 @@ struct DeviceProduct { } }; -/** - * @brief binary `and` operator - */ -struct DeviceAnd { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs 
& rhs) - { - return (lhs & rhs); - } -}; - -/** - * @brief binary `or` operator - */ -struct DeviceOr { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) - { - return (lhs | rhs); - } -}; - -/** - * @brief binary `xor` operator - */ -struct DeviceXor { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) - { - return (lhs ^ rhs); - } -}; - /** * @brief Operator for calculating Lead/Lag window function. */ struct DeviceLeadLag { - const size_type row_offset; + size_type const row_offset; explicit CUDF_HOST_DEVICE inline DeviceLeadLag(size_type offset_) : row_offset(offset_) {} }; diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh deleted file mode 100644 index ca9c16043a3..00000000000 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace cudf { -namespace detail { - -/** - * Normalization of floating point NaNs, passthrough for all other values. - */ -template -T __device__ inline normalize_nans(T const& key) -{ - if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } - } - return key; -} - -/** - * Normalization of floating point NaNs and zeros, passthrough for all other values. 
- */ -template -T __device__ inline normalize_nans_and_zeros(T const& key) -{ - if constexpr (cudf::is_floating_point()) { - if (key == T{0.0}) { return T{0.0}; } - } - return normalize_nans(key); -} - -__device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r) -{ - // This function is equivalent to (x << r) | (x >> (32 - r)) - return __funnelshift_l(x, x, r); -} - -__device__ inline uint32_t rotate_bits_right(uint32_t x, uint32_t r) -{ - // This function is equivalent to (x >> r) | (x << (32 - r)) - return __funnelshift_r(x, x, r); -} - -__device__ inline uint64_t rotate_bits_right(uint64_t x, uint32_t r) -{ - return (x >> r) | (x << (64 - r)); -} - -// Swap the endianness of a 32 bit value -__device__ inline uint32_t swap_endian(uint32_t x) -{ - // The selector 0x0123 reverses the byte order - return __byte_perm(x, 0, 0x0123); -} - -// Swap the endianness of a 64 bit value -// There is no CUDA intrinsic for permuting bytes in 64 bit integers -__device__ inline uint64_t swap_endian(uint64_t x) -{ - // Reverse the endianness of each 32 bit section - uint32_t low_bits = swap_endian(static_cast(x)); - uint32_t high_bits = swap_endian(static_cast(x >> 32)); - // Reassemble a 64 bit result, swapping the low bits and high bits - return (static_cast(low_bits) << 32) | (static_cast(high_bits)); -}; - -template -struct hash_circular_buffer { - uint8_t storage[capacity]; - uint8_t* cur; - int available_space{capacity}; - hash_step_callable hash_step; - - __device__ inline hash_circular_buffer(hash_step_callable hash_step) - : cur{storage}, hash_step{hash_step} - { - } - - __device__ inline void put(uint8_t const* in, int size) - { - int copy_start = 0; - while (size >= available_space) { - // The buffer will be filled by this chunk of data. Copy a chunk of the - // data to fill the buffer and trigger a hash step. - memcpy(cur, in + copy_start, available_space); - hash_step(storage); - size -= available_space; - copy_start += available_space; - cur = storage; - available_space = capacity; - } - // The buffer will not be filled by the remaining data. That is, `size >= 0 - // && size < capacity`. We copy the remaining data into the buffer but do - // not trigger a hash step. - memcpy(cur, in + copy_start, size); - cur += size; - available_space -= size; - } - - __device__ inline void pad(int const space_to_leave) - { - if (space_to_leave > available_space) { - memset(cur, 0x00, available_space); - hash_step(storage); - cur = storage; - available_space = capacity; - } - memset(cur, 0x00, available_space - space_to_leave); - cur += available_space - space_to_leave; - available_space = space_to_leave; - } - - __device__ inline const uint8_t& operator[](int idx) const { return storage[idx]; } -}; - -// Get a uint8_t pointer to a column element and its size as a pair. -template -auto __device__ inline get_element_pointer_and_size(Element const& element) -{ - if constexpr (is_fixed_width() && !is_chrono()) { - return thrust::make_pair(reinterpret_cast(&element), sizeof(Element)); - } else { - CUDF_UNREACHABLE("Unsupported type."); - } -} - -template <> -auto __device__ inline get_element_pointer_and_size(string_view const& element) -{ - return thrust::make_pair(reinterpret_cast(element.data()), element.size_bytes()); -} - -/** - * Modified GPU implementation of - * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ - * Copyright (c) 2015 Barry Clark - * Licensed under the MIT license. 
- * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ -void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) -{ - // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 - uint64_t x = num; - x = ((x & 0xFFFF'0000u) << 16) | ((x & 0xFFFF)); - x = ((x & 0x000F'0000'000Fu) << 8) | ((x & 0x00F0'0000'00F0u) >> 4) | - ((x & 0x0F00'0000'0F00u) << 16) | ((x & 0xF000'0000'F000) << 4); - - // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits - uint64_t offsets = (((x + 0x0606'0606'0606'0606) >> 4) & 0x0101'0101'0101'0101) * 0x27; - - x |= 0x3030'3030'3030'3030; - x += offsets; - std::memcpy(destination, reinterpret_cast(&x), 8); -} - -// MurmurHash3_32 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. -template -struct MurmurHash3_32 { - using result_type = hash_value_type; - - constexpr MurmurHash3_32() = default; - constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - - [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto const block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - [[nodiscard]] result_type __device__ inline operator()(Key const& key) const - { - return compute(detail::normalize_nans_and_zeros(key)); - } - - template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a four-byte chunk. - uint32_t k1 = 0; - switch (len % 4) { - case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; [[fallthrough]]; - case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; [[fallthrough]]; - case 1: - k1 ^= std::to_integer(data[tail_offset]); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const - { - constexpr cudf::size_type BLOCK_SIZE = 4; - cudf::size_type const nblocks = len / BLOCK_SIZE; - cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h = m_seed; - - // Process all four-byte chunks. 
- for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(data, i * BLOCK_SIZE); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = cudf::detail::rotate_bits_left(h, rot_c2); - h = h * 5 + c3; - } - - h = compute_remaining_bytes(data, len, tail_offset, h); - - // Finalize hash. - h ^= len; - h = fmix32(h); - return h; - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; - static constexpr uint32_t c1 = 0xcc9e2d51; - static constexpr uint32_t c2 = 0x1b873593; - static constexpr uint32_t c3 = 0xe6546b64; - static constexpr uint32_t rot_c1 = 15; - static constexpr uint32_t rot_c2 = 13; -}; - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& key) const -{ - return compute(static_cast(key)); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const -{ - return compute(detail::normalize_nans_and_zeros(key)); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const -{ - return compute(detail::normalize_nans_and_zeros(key)); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - numeric::decimal64 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - numeric::decimal128 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::list_view const& key) const -{ - CUDF_UNREACHABLE("List column hashing is not supported"); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::struct_view const& key) const -{ - CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); -} - -/** - * @brief This hash function simply returns the value that is asked to be hash - * reinterpreted as the result_type of the functor. - */ -template -struct IdentityHash { - using result_type = hash_value_type; - IdentityHash() = default; - constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} - - template - constexpr std::enable_if_t, return_type> operator()( - Key const& key) const - { - CUDF_UNREACHABLE("IdentityHash does not support this data type"); - } - - template - constexpr std::enable_if_t, return_type> operator()( - Key const& key) const - { - return static_cast(key); - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; -}; - -template -using default_hash = MurmurHash3_32; - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/int_fastdiv.h b/cpp/include/cudf/detail/utilities/int_fastdiv.h index b56fe0e88c1..ff442af5194 100644 --- a/cpp/include/cudf/detail/utilities/int_fastdiv.h +++ b/cpp/include/cudf/detail/utilities/int_fastdiv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Copyright 2014 Maxim Milakov * @@ -58,7 +58,7 @@ class int_fastdiv { int p; unsigned int ad, anc, delta, q1, r1, q2, r2, t; - const unsigned two31 = 0x8000'0000u; + unsigned const two31 = 0x8000'0000u; ad = (d == 0) ? 1 : abs(d); t = two31 + ((unsigned int)d >> 31); anc = t - 1 - t % ad; @@ -95,11 +95,11 @@ class int_fastdiv { n_add_sign = 0; } - __host__ __device__ __forceinline__ friend int operator/(const int divident, - const int_fastdiv& divisor); + __host__ __device__ __forceinline__ friend int operator/(int const divident, + int_fastdiv const& divisor); }; -__host__ __device__ __forceinline__ int operator/(const int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(int const n, int_fastdiv const& divisor) { int q; #ifdef __CUDA_ARCH__ @@ -115,61 +115,61 @@ __host__ __device__ __forceinline__ int operator/(const int n, const int_fastdiv return q; } -__host__ __device__ __forceinline__ int operator%(const int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(int const n, int_fastdiv const& divisor) { int quotient = n / divisor; int remainder = n - quotient * divisor; return remainder; } -__host__ __device__ __forceinline__ int operator/(const unsigned int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(unsigned int const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const unsigned int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(unsigned int const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const short n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(short const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const short n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(short const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const unsigned short n, - const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(unsigned short const n, + int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const unsigned short n, - const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(unsigned short const n, + int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(char const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(char const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const unsigned char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(unsigned char const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const unsigned char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(unsigned char const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp 
b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 40faae7e9f4..8b709f2a8f8 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -S round_up_safe(S number_to_round, S modulus) +constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -67,7 +67,7 @@ S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. */ template -S round_down_safe(S number_to_round, S modulus) noexcept +constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; @@ -107,7 +107,7 @@ constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept * the result will be incorrect */ template -constexpr S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept { return (dividend + divisor - 1) / divisor; } diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 83f061e9407..9e2b85ea129 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -50,7 +50,7 @@ class pinned_allocator { public: using value_type = void; ///< The type of the elements in the allocator using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = const void*; ///< The type returned by address() + using const_pointer = void const*; ///< The type returned by address() using size_type = std::size_t; ///< The type used for the size of the allocation using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers @@ -76,9 +76,9 @@ class pinned_allocator { public: using value_type = T; ///< The type of the elements in the allocator using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = const T*; ///< The type returned by address() + using const_pointer = T const*; ///< The type returned by address() using reference = T&; ///< The parameter type for address() - using const_reference = const T&; ///< The parameter type for address() + using const_reference = T const&; ///< The parameter type for address() using size_type = std::size_t; ///< The type used for the size of the allocation using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers diff --git a/cpp/include/cudf/detail/utilities/stacktrace.hpp b/cpp/include/cudf/detail/utilities/stacktrace.hpp new file mode 100644 index 00000000000..c3ec9ce7a52 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/stacktrace.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::detail { +/** + * @addtogroup utility_stacktrace + * @{ + * @file + */ + +/** + * @brief Specify whether the last stackframe is included in the stacktrace. + */ +enum class capture_last_stackframe : bool { YES, NO }; + +/** + * @brief Query the current stacktrace and return the whole stacktrace as one string. + * + * Depending on the value of the flag `capture_last_frame`, the caller that executes stacktrace + * retrieval can be included in the output result. + * + * @param capture_last_frame Flag to specify if the current stackframe will be included into + * the output + * @return A string storing the whole current stacktrace + */ +std::string get_stacktrace(capture_last_stackframe capture_last_frame); + +/** @} */ // end of group + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index c446a7b5148..90ad98741ad 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -398,7 +398,7 @@ thrust::host_vector make_host_vector_async( } /** - * @brief Synchronously construct a `std::vector` containing a copy of data from a + * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * * @note This function does a synchronize on `stream`. @@ -417,7 +417,7 @@ thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_s } /** - * @brief Synchronously construct a `std::vector` containing a copy of data from a device + * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * * @note This function synchronizes `stream`. diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 76d6fd719a4..f3f95dad017 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -49,8 +49,8 @@ __global__ void valid_if_kernel( { constexpr size_type leader_lane{0}; auto const lane_id{threadIdx.x % warp_size}; - thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count{0}; auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size); @@ -119,7 +119,7 @@ std::pair valid_if(InputIterator begin, * Given a set of bitmasks, `masks`, the state of bit `j` in mask `i` is * determined by `p( *(begin1 + i), *(begin2 + j))`. If the predicate evaluates - * to true, the the bit is set to `1`. If false, set to `0`. + * to true, the bit is set to `1`. If false, set to `0`. * * Example Arguments: * begin1: zero-based counting iterator, diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 8688e97ab7e..1268f488919 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -54,11 +55,13 @@ namespace cudf { * @param begin The starting index of the fill range (inclusive) * @param end The index of the last element in the fill range (exclusive) * @param value The scalar value to fill + * @param stream CUDA stream used for device memory operations and kernel launches */ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, - scalar const& value); + scalar const& value, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Fills a range of elements in a column out-of-place with a scalar @@ -79,6 +82,7 @@ void fill_in_place(mutable_column_view& destination, * @param begin The starting index of the fill range (inclusive) * @param end The index of the last element in the fill range (exclusive) * @param value The scalar value to fill + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result output column */ @@ -87,6 +91,7 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,12 +118,14 @@ std::unique_ptr fill( * * @param input_table Input table * @param count Non-nullable column of an integral type + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,19 +138,19 @@ std::unique_ptr
repeat( * count = 2 * return = [4,4,5,5,6,6] * ``` - * @throws cudf::logic_error if the data type of @p count is not size_type. - * @throws cudf::logic_error if @p count is invalid or @p count is negative. - * @throws cudf::logic_error if @p input_table.num_rows() * @p count overflows - * size_type. + * @throws cudf::logic_error if @p count is negative. + * @throws std::overflow_error if @p input_table.num_rows() * @p count overflows size_type. * * @param input_table Input table * @param count Number of repetitions + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -166,6 +173,7 @@ std::unique_ptr
repeat( * @param size Size of the output column * @param init First value in the sequence * @param step Increment value + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result column containing the generated sequence */ @@ -173,6 +181,7 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -192,12 +201,14 @@ std::unique_ptr sequence( * * @param size Size of the output column * @param init First value in the sequence + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, scalar const& init, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -219,6 +230,7 @@ std::unique_ptr sequence( * @param size Number of timestamps to generate * @param init The initial timestamp * @param months Months to increment + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return Timestamps column with sequences of months @@ -227,6 +239,7 @@ std::unique_ptr calendrical_month_sequence( size_type size, scalar const& init, size_type months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index b178700cfc3..7c59c2f9194 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -20,11 +20,8 @@ #include #include -// Note: The versions are used in order for Jitify to work with our fixed_point type. -// Jitify is needed for several algorithms (binaryop, rolling, etc) -#include #include -#include // add cuda namespace +#include #include #include diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 8492916bb3c..1de7f66127b 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,12 @@ */ #pragma once +// To avoid https://github.com/NVIDIA/libcudacxx/issues/460 +// in libcudacxx with CTK 12.0/12.1 +#include #include -// Note: The versions are used in order for Jitify to work with our fixed_point type. 
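The fill/repeat/sequence overloads above now take an explicit rmm::cuda_stream_view ahead of the memory resource. A hedged sketch of calling the new signature; the scalar types, sizes, and function name are illustrative:

#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <rmm/cuda_stream_view.hpp>

// Illustrative only: generate 0, 2, 4, ... on an explicitly passed (here: default) stream.
auto make_even_sequence()
{
  cudf::numeric_scalar<int32_t> init(0);
  cudf::numeric_scalar<int32_t> step(2);
  return cudf::sequence(1000, init, step, cudf::get_default_stream());
}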
-// Jitify is needed for several algorithms (binaryop, rolling, etc) -#include #include #include diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 2f5c0d53e72..6e575685daa 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -186,6 +186,15 @@ class groupby { host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @copydoc aggregate(host_span, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ + std::pair, std::vector> aggregate( + host_span requests, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs grouped scans on the specified values. * @@ -294,7 +303,7 @@ class groupby { std::pair, std::unique_ptr
> shift( table_view const& values, host_span offsets, - std::vector> const& fill_values, + std::vector> const& fill_values, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index a8f4f271309..72e32715ed4 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,6 @@ namespace cudf { -using hash_value_type = uint32_t; ///< Type of hash value - /** * @addtogroup column_hash * @{ @@ -31,7 +29,14 @@ using hash_value_type = uint32_t; ///< Type of hash value */ /** - * @brief Identifies the hash function to be used + * @brief Type of hash value + * + */ +using hash_value_type = uint32_t; + +/** + * @brief Identifies the hash function to be used + * */ enum class hash_id { HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed @@ -48,9 +53,12 @@ static constexpr uint32_t DEFAULT_HASH_SEED = 0; /** * @brief Computes the hash value of each row in the input set of columns. * + * @deprecated Since 23.08 + * * @param input The table of columns to hash * @param hash_function The hash function enum to use * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A column where each row is the hash of a column from the input @@ -59,7 +67,103 @@ std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +//! Hash APIs +namespace hashing { + +/** + * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table + * + * This function computes the hash of each column using the `seed` for the first column + * and the resulting hash as a seed for the next column and so on. + * The result is a uint32 value for each row. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr murmurhash3_x86_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the MurmurHash3 64-bit hash value of each row in the given table + * + * This function takes a 64-bit seed value and returns hash values using the + * MurmurHash3_x64_128 algorithm. The hash produces in two uint64 values per row. 
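These declarations introduce the public cudf::hashing namespace. A minimal sketch of how a caller might use it once this change lands; the wrapper function name is illustrative:

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/table/table_view.hpp>

// Illustrative only: one 32-bit hash value per row, default seed and stream.
std::unique_ptr<cudf::column> hash_rows(cudf::table_view const& input)
{
  return cudf::hashing::murmurhash3_x86_32(input);
}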
+ * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A table of two UINT64 columns + */ +std::unique_ptr
murmurhash3_x64_128( + table_view const& input, + uint64_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table + * + * This function computes the hash similar to MurmurHash3_x86_32 with special processing + * to match Spark's implementation results. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr spark_murmurhash3_x86_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the MD5 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr md5( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the XXHash_64 hash value of each row in the given table + * + * This function takes a 64-bit seed value and returns a column of type UINT64. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr xxhash_64( + table_view const& input, + uint64_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace hashing + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/hashing/detail/default_hash.cuh b/cpp/include/cudf/hashing/detail/default_hash.cuh new file mode 100644 index 00000000000..37e13d8842f --- /dev/null +++ b/cpp/include/cudf/hashing/detail/default_hash.cuh @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cudf::hashing::detail { + +/** + * @brief The default hash algorithm for use within libcudf internal functions + * + * This is declared here so it may be changed to another algorithm without modifying + * all those places that use it. Internal function implementations are encourage to + * use the `cudf::hashing::detail::default_hash` where possible. + * + * @tparam Key The key type for use by the hash class + */ +template +using default_hash = MurmurHash3_x86_32; + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh new file mode 100644 index 00000000000..7a3d1990791 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf::hashing::detail { + +/** + * Normalization of floating point NaNs, passthrough for all other values. + */ +template +T __device__ inline normalize_nans(T const& key) +{ + if constexpr (cudf::is_floating_point()) { + if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + } + return key; +} + +/** + * Normalization of floating point NaNs and zeros, passthrough for all other values. + */ +template +T __device__ inline normalize_nans_and_zeros(T const& key) +{ + if constexpr (cudf::is_floating_point()) { + if (key == T{0.0}) { return T{0.0}; } + } + return normalize_nans(key); +} + +__device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r) +{ + // This function is equivalent to (x << r) | (x >> (32 - r)) + return __funnelshift_l(x, x, r); +} + +__device__ inline uint64_t rotate_bits_left(uint64_t x, uint32_t r) +{ + return (x << r) | (x >> (64 - r)); +} + +__device__ inline uint32_t rotate_bits_right(uint32_t x, uint32_t r) +{ + // This function is equivalent to (x >> r) | (x << (32 - r)) + return __funnelshift_r(x, x, r); +} + +__device__ inline uint64_t rotate_bits_right(uint64_t x, uint32_t r) +{ + return (x >> r) | (x << (64 - r)); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp similarity index 62% rename from cpp/include/cudf/detail/hashing.hpp rename to cpp/include/cudf/hashing/detail/hashing.hpp index 771b3e150ec..f08d0fbb849 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -24,32 +24,32 @@ #include namespace cudf { +namespace hashing { namespace detail { -/** - * @copydoc cudf::hash - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr murmurhash3_x86_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr
murmurhash3_x64_128(table_view const& input, + uint64_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); -std::unique_ptr murmur_hash3_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); +std::unique_ptr spark_murmurhash3_x86_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); -std::unique_ptr spark_murmur_hash3_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); +std::unique_ptr md5(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); -std::unique_ptr md5_hash(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr xxhash_64(table_view const& input, + uint64_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); /* Copyright 2005-2014 Daniel James. * @@ -94,6 +94,7 @@ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) } } // namespace detail +} // namespace hashing } // namespace cudf // specialization of std::hash for cudf::data_type @@ -102,8 +103,8 @@ template <> struct hash { std::size_t operator()(cudf::data_type const& type) const noexcept { - return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), - std::hash{}(type.scale())); + return cudf::hashing::detail::hash_combine( + std::hash{}(static_cast(type.id())), std::hash{}(type.scale())); } }; } // namespace std diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh new file mode 100644 index 00000000000..c986a908706 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cudf::hashing::detail { + +// MurmurHash3_x64_128 implementation from +// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. +template +struct MurmurHash3_x64_128 { + using result_type = thrust::pair; + + constexpr MurmurHash3_x64_128() = default; + constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} + + __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + { + // Read a 4-byte value from the data pointer as individual bytes for safe + // unaligned access (very likely for string types). 
+ auto block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + } + + __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const + { + uint64_t result = getblock32(data, offset + 4); + result = result << 32; + return result | getblock32(data, offset); + } + + __device__ inline uint64_t fmix64(uint64_t k) const + { + k ^= k >> 33; + k *= 0xff51afd7ed558ccdUL; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53UL; + k ^= k >> 33; + return k; + } + + result_type __device__ inline operator()(Key const& key) const { return compute(key); } + + template + result_type __device__ inline compute(T const& key) const + { + return compute_bytes(reinterpret_cast(&key), sizeof(T)); + } + + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const + { + // Process remaining bytes that do not fill a 8-byte chunk. + uint64_t k1 = 0; + uint64_t k2 = 0; + auto const tail = reinterpret_cast(data) + tail_offset; + switch (len & (BLOCK_SIZE - 1)) { + case 15: k2 ^= static_cast(tail[14]) << 48; + case 14: k2 ^= static_cast(tail[13]) << 40; + case 13: k2 ^= static_cast(tail[12]) << 32; + case 12: k2 ^= static_cast(tail[11]) << 24; + case 11: k2 ^= static_cast(tail[10]) << 16; + case 10: k2 ^= static_cast(tail[9]) << 8; + case 9: + k2 ^= static_cast(tail[8]) << 0; + k2 *= c2; + k2 = rotate_bits_left(k2, 33); + k2 *= c1; + h.second ^= k2; + + case 8: k1 ^= static_cast(tail[7]) << 56; + case 7: k1 ^= static_cast(tail[6]) << 48; + case 6: k1 ^= static_cast(tail[5]) << 40; + case 5: k1 ^= static_cast(tail[4]) << 32; + case 4: k1 ^= static_cast(tail[3]) << 24; + case 3: k1 ^= static_cast(tail[2]) << 16; + case 2: k1 ^= static_cast(tail[1]) << 8; + case 1: + k1 ^= static_cast(tail[0]) << 0; + k1 *= c1; + k1 = rotate_bits_left(k1, 31); + k1 *= c2; + h.first ^= k1; + }; + return h; + } + + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + { + auto const nblocks = len / BLOCK_SIZE; + uint64_t h1 = m_seed; + uint64_t h2 = m_seed; + + // Process all four-byte chunks. + for (cudf::size_type i = 0; i < nblocks; i++) { + uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes + uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes + + k1 *= c1; + k1 = rotate_bits_left(k1, 31); + k1 *= c2; + + h1 ^= k1; + h1 = rotate_bits_left(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = rotate_bits_left(k2, 33); + k2 *= c1; + + h2 ^= k2; + h2 = rotate_bits_left(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); + + // Finalize hash. 
+ h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return {h1, h2}; + } + + private: + uint64_t m_seed{}; + static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes + + static constexpr uint64_t c1 = 0x87c37b91114253d5UL; + static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; +}; + +template <> +MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( + bool const& key) const +{ + return compute(key); +} + +template <> +MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( + float const& key) const +{ + return compute(normalize_nans(key)); +} + +template <> +MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( + double const& key) const +{ + return compute(normalize_nans(key)); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + cudf::string_view const& key) const +{ + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return compute_bytes(data, len); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + numeric::decimal32 const& key) const +{ + return compute(key.value()); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + numeric::decimal64 const& key) const +{ + return compute(key.value()); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + numeric::decimal128 const& key) const +{ + return compute(key.value()); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh new file mode 100644 index 00000000000..6cf0b0fe817 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::hashing::detail { + +// MurmurHash3_x86_32 implementation from +// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
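The relocated MurmurHash3_x86_32 functor declared just below is a device-callable hasher keyed on a single value type. A rough sketch of direct use; the kernel and output layout are illustrative only:

#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>

#include <cstdint>

// Illustrative only: hash a single int32 key on the device with the default seed.
__global__ void hash_one_key(int32_t key, uint32_t* out)
{
  cudf::hashing::detail::MurmurHash3_x86_32<int32_t> hasher{};
  *out = hasher(key);
}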
+template <typename Key>
+struct MurmurHash3_x86_32 {
+  using result_type = hash_value_type;
+
+  constexpr MurmurHash3_x86_32() = default;
+  constexpr MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {}
+
+  [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const
+  {
+    h ^= h >> 16;
+    h *= 0x85ebca6b;
+    h ^= h >> 13;
+    h *= 0xc2b2ae35;
+    h ^= h >> 16;
+    return h;
+  }
+
+  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
+                                                      cudf::size_type offset) const
+  {
+    // Read a 4-byte value from the data pointer as individual bytes for safe
+    // unaligned access (very likely for string types).
+    auto const block = reinterpret_cast<uint8_t const*>(data + offset);
+    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
+  }
+
+  [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
+  {
+    return compute(normalize_nans_and_zeros(key));
+  }
+
+  template <typename T>
+  result_type __device__ inline compute(T const& key) const
+  {
+    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
+  }
+
+  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
+                                                        cudf::size_type len,
+                                                        cudf::size_type tail_offset,
+                                                        result_type h) const
+  {
+    // Process remaining bytes that do not fill a four-byte chunk.
+    uint32_t k1 = 0;
+    switch (len % 4) {
+      case 3: k1 ^= std::to_integer<uint32_t>(data[tail_offset + 2]) << 16; [[fallthrough]];
+      case 2: k1 ^= std::to_integer<uint32_t>(data[tail_offset + 1]) << 8; [[fallthrough]];
+      case 1:
+        k1 ^= std::to_integer<uint32_t>(data[tail_offset]);
+        k1 *= c1;
+        k1 = rotate_bits_left(k1, rot_c1);
+        k1 *= c2;
+        h ^= k1;
+    };
+    return h;
+  }
+
+  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
+  {
+    constexpr cudf::size_type BLOCK_SIZE = 4;
+    cudf::size_type const nblocks        = len / BLOCK_SIZE;
+    cudf::size_type const tail_offset    = nblocks * BLOCK_SIZE;
+    result_type h                        = m_seed;
+
+    // Process all four-byte chunks.
+    for (cudf::size_type i = 0; i < nblocks; i++) {
+      uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
+      k1 *= c1;
+      k1 = rotate_bits_left(k1, rot_c1);
+      k1 *= c2;
+      h ^= k1;
+      h = rotate_bits_left(h, rot_c2);
+      h = h * 5 + c3;
+    }
+
+    h = compute_remaining_bytes(data, len, tail_offset, h);
+
+    // Finalize hash.
+ h ^= len; + h = fmix32(h); + return h; + } + + private: + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; +}; + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()(bool const& key) const +{ + return compute(static_cast(key)); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()(float const& key) const +{ + return compute(normalize_nans_and_zeros(key)); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()(double const& key) const +{ + return compute(normalize_nans_and_zeros(key)); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + cudf::string_view const& key) const +{ + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return compute_bytes(data, len); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal32 const& key) const +{ + return compute(key.value()); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal64 const& key) const +{ + return compute(key.value()); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal128 const& key) const +{ + return compute(key.value()); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + cudf::list_view const& key) const +{ + CUDF_UNREACHABLE("List column hashing is not supported"); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + cudf::struct_view const& key) const +{ + CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp new file mode 100644 index 00000000000..5f79f05c5a1 --- /dev/null +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "datasource.hpp" + +#include +#include + +#include +#include + +namespace cudf::io { +/** + * @addtogroup io_datasources + * @{ + * @file + */ + +/** + * @brief Implementation class for reading from an Apache Arrow file. The file + * could be a memory-mapped file or other implementation supported by Arrow. + */ +class arrow_io_source : public datasource { + public: + /** + * @brief Constructs an object from an Apache Arrow Filesystem URI + * + * @param arrow_uri Apache Arrow Filesystem URI + */ + explicit arrow_io_source(std::string const& arrow_uri); + + /** + * @brief Constructs an object from an `arrow` source object. 
+ * + * @param file The `arrow` object from which the data is read + */ + explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} + + /** + * @brief Returns a buffer with a subset of data from the `arrow` source. + * + * @param offset The offset in bytes from which to read + * @param size The number of bytes to read + * @return A buffer with the read data + */ + std::unique_ptr host_read(size_t offset, size_t size) override; + + /** + * @brief Reads a selected range from the `arrow` source into a preallocated buffer. + * + * @param[in] offset The offset in bytes from which to read + * @param[in] size The number of bytes to read + * @param[out] dst The preallocated buffer to read into + * @return The number of bytes read + */ + size_t host_read(size_t offset, size_t size, uint8_t* dst) override; + /** + * @brief Returns the size of the data in the `arrow` source. + * + * @return The size of the data in the `arrow` source + */ + [[nodiscard]] size_t size() const override; + + private: + std::shared_ptr filesystem; + std::shared_ptr arrow_file; +}; + +/** @} */ // end of group +} // namespace cudf::io diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index b5669438b4f..c84ca7e6c73 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -208,7 +208,7 @@ class csv_reader_options { [[nodiscard]] std::size_t get_byte_range_padding() const { auto const num_names = _names.size(); - auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); auto const num_columns = std::max(num_dtypes, num_names); auto const max_row_bytes = 16 * 1024; // 16KB @@ -567,31 +567,33 @@ class csv_reader_options { /** * @brief Sets number of rows to skip from start. * - * @param skip Number of rows to skip + * @param skiprows Number of rows to skip */ - void set_skiprows(size_type skip) + void set_skiprows(size_type skiprows) { - if ((skip != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { - CUDF_FAIL( - "skiprows can't be a non zero value if range offset and/or range size has been set"); + if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { + CUDF_FAIL("skiprows must be zero if range offset or range size has been set", + std::invalid_argument); } - _skiprows = skip; + _skiprows = skiprows; } /** * @brief Sets number of rows to skip from end. * - * @param skip Number of rows to skip + * @param skipfooter Number of rows to skip */ - void set_skipfooter(size_type skip) + void set_skipfooter(size_type skipfooter) { - CUDF_EXPECTS((skip == 0) or (_nrows == -1), "Cannot use both `nrows` and `skipfooter`"); - if ((skip != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { - CUDF_FAIL( - "skipfooter can't be a non zero value if range offset and/or range size has been set"); + CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1), + "Cannot use both `nrows` and `skipfooter`", + std::invalid_argument); + if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { + CUDF_FAIL("skipfooter must be zero if range offset or range size has been set", + std::invalid_argument); } - _skipfooter = skip; + _skipfooter = skipfooter; } /** diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 88f9c188530..69d8a388d45 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -30,6 +30,13 @@ namespace cudf { //! 
IO interfaces namespace io { + +/** + * @addtogroup io_datasinks + * @{ + * @file + */ + /** * @brief Interface class for storing the output data from the writers */ @@ -41,7 +48,7 @@ class data_sink { * @param[in] filepath Path to the file to use * @return Constructed data_sink object */ - static std::unique_ptr create(const std::string& filepath); + static std::unique_ptr create(std::string const& filepath); /** * @brief Create a sink from a std::vector @@ -200,5 +207,6 @@ class data_sink { virtual size_t bytes_written() = 0; }; +/** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 5c37be5a56f..28263d466f3 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -22,35 +22,6 @@ #include -#include - -// We disable warning 611 because some Arrow subclasses of -// `arrow::fs::FileSystem` only partially override the `Equals` method, -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#endif -#include -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#endif - -// We disable warning 2810 to workaround the compile issue (warning treated as error): -// result.h(263): error #2810-D: ignoring return value type with "nodiscard" attribute -#ifdef __CUDACC__ -#pragma nv_diag_suppress 2810 -#endif -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 2810 -#endif - -#include -#include -#include -#include - #include #include @@ -58,6 +29,12 @@ namespace cudf { //! IO interfaces namespace io { +/** + * @addtogroup io_datasources + * @{ + * @file + */ + /** * @brief Interface class for providing input data to the readers. */ @@ -113,7 +90,7 @@ class datasource { * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) * @return Constructed datasource object */ - static std::unique_ptr create(const std::string& filepath, + static std::unique_ptr create(std::string const& filepath, size_t offset = 0, size_t size = 0); @@ -143,15 +120,6 @@ class datasource { */ static std::unique_ptr create(cudf::device_span buffer); - /** - * @brief Creates a source from a from an Arrow file. - * - * @param[in] arrow_file RandomAccessFile to which the API calls are forwarded - * @return Constructed datasource object - */ - static std::unique_ptr create( - std::shared_ptr arrow_file); - /** * @brief Creates a source from an user implemented datasource object. * @@ -406,107 +374,6 @@ class datasource { }; }; -/** - * @brief Implementation class for reading from an Apache Arrow file. The file - * could be a memory-mapped file or other implementation supported by Arrow. - */ -class arrow_io_source : public datasource { - /** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. 
- */ - class arrow_io_buffer : public buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(arrow_buffer) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } - }; - - public: - /** - * @brief Constructs an object from an Apache Arrow Filesystem URI - * - * @param arrow_uri Apache Arrow Filesystem URI - */ - explicit arrow_io_source(std::string_view arrow_uri) - { - const std::string uri_start_delimiter = "//"; - const std::string uri_end_delimiter = "?"; - - arrow::Result> result = - arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); - size_t end = arrow_uri.find(uri_end_delimiter) - start; - std::string_view path = arrow_uri.substr(start, end); - - arrow::Result> in_stream = - filesystem->OpenInputFile(static_cast(path).c_str()); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); - } - - /** - * @brief Constructs an object from an `arrow` source object. - * - * @param file The `arrow` object from which the data is read - */ - explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} - - /** - * @brief Returns a buffer with a subset of data from the `arrow` source. - * - * @param offset The offset in bytes from which to read - * @param size The number of bytes to read - * @return A buffer with the read data - */ - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); - } - - /** - * @brief Reads a selected range from the `arrow` source into a preallocated buffer. - * - * @param[in] offset The offset in bytes from which to read - * @param[in] size The number of bytes to read - * @param[out] dst The preallocated buffer to read into - * @return The number of bytes read - */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); - } - - /** - * @brief Returns the size of the data in the `arrow` source. 
- * - * @return The size of the data in the `arrow` source - */ - [[nodiscard]] size_t size() const override - { - auto result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); - } - - private: - std::shared_ptr filesystem; - std::shared_ptr arrow_file; -}; - +/** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index a62c9873e75..b7ee5e05e96 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -32,7 +32,7 @@ #include -namespace cudf::io::json::experimental::detail { +namespace cudf::io::json::detail { // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -181,7 +181,7 @@ process_string(in_iterator_t in_begin, cudf::io::parse_options_view const& options) { int32_t bytes = 0; - const auto num_in_chars = thrust::distance(in_begin, in_end); + auto const num_in_chars = thrust::distance(in_begin, in_end); // String values are indicated by keeping the quote character bool const is_string_value = num_in_chars >= 2LL && @@ -428,4 +428,4 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return out_col; } -} // namespace cudf::io::json::experimental::detail +} // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7b0350e9bc8..6930a4fdb25 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -33,7 +33,7 @@ namespace cudf::io::json::detail { * * @return cudf::table object that contains the array of cudf::column. */ -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index b7794c0df6a..623f402f9c9 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -70,11 +70,9 @@ class reader { * @brief Reads the entire dataset. * * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. - * * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options, rmm::cuda_stream_view stream); + table_with_metadata read(orc_reader_options const& options); }; /** diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 9a94924824d..3f2e1fa5e6c 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -21,6 +21,7 @@ #pragma once #include +#include #include #include @@ -208,8 +209,17 @@ class writer { * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ static std::unique_ptr> merge_row_group_metadata( - const std::vector>>& metadata_list); + std::vector>> const& metadata_list); }; +/** + * @brief Reads metadata of parquet dataset. + * + * @param sources Dataset sources to read from + * + * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value + * metadata. 
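
[Illustrative aside, not part of the patch.] With arrow_io_source moved out of datasource.hpp into its own cudf/io/arrow_io_source.hpp header, and the Arrow headers dropped from datasource.hpp, code that reads through Arrow filesystems now has to opt in explicitly. A rough sketch of the resulting call pattern; the helper name and file are made up, and it assumes source_info still accepts a user-provided datasource pointer as it does for other custom datasources:

    #include <cudf/io/arrow_io_source.hpp>  // now an explicit, separate include
    #include <cudf/io/csv.hpp>

    #include <memory>
    #include <string>

    // Hypothetical helper: read a CSV file through an Arrow filesystem URI.
    cudf::io::table_with_metadata read_csv_via_arrow(std::string const& uri)
    {
      // arrow_io_source no longer comes in through <cudf/io/datasource.hpp>.
      auto source = std::make_unique<cudf::io::arrow_io_source>(uri);

      auto options =
        cudf::io::csv_reader_options::builder(cudf::io::source_info{source.get()}).build();
      return cudf::io::read_csv(options);
    }
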
+ */ +parquet_metadata read_parquet_metadata(host_span const> sources); } // namespace detail::parquet } // namespace cudf::io diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index 4914f434c98..b2ea29a85c3 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -110,6 +110,8 @@ enum token_t : PdaTokenT { ValueEnd, /// Beginning-of-error token (on first encounter of a parsing error) ErrorBegin, + /// Delimiting a JSON line for error recovery + LineEnd, /// Total number of tokens NUM_TOKENS }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index f94fd5adeb8..15dc2a614ad 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -54,6 +54,14 @@ struct schema_element { std::map child_types; }; +/** + * @brief Control the error recovery behavior of the json parser + */ +enum class json_recovery_mode_t { + FAIL, ///< Does not recover from an error when encountering an invalid format + RECOVER_WITH_NULL ///< Recovers from an error, replacing invalid records with null +}; + /** * @brief Input arguments to the `read_json` interface. * @@ -105,12 +113,15 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Whether to recover after an invalid JSON line + json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + /** * @brief Constructor from source info. * * @param src source information used to read parquet file */ - explicit json_reader_options(const source_info& src) : _source(src) {} + explicit json_reader_options(source_info const& src) : _source(src) {} friend json_reader_options_builder; @@ -192,7 +203,7 @@ class json_reader_options { */ size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -235,6 +246,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Queries the JSON reader's behavior on invalid JSON lines. + * + * @returns An enum that specifies the JSON reader's behavior on invalid JSON lines. + */ + json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** * @brief Set data types for columns to be read. * @@ -305,6 +323,13 @@ class json_reader_options { * of string values */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + */ + void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } }; /** @@ -449,6 +474,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + * @return this for chaining + */ + json_reader_options_builder& recovery_mode(json_recovery_mode_t val) + { + options._recovery_mode = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. 
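
[Illustrative aside, not part of the patch.] The new json_recovery_mode_t knob is aimed at JSON-lines input, where a malformed line can be replaced by a null row instead of failing the whole read (the new LineEnd token above supports that recovery path). A sketch of opting in through the builder; the helper name and file path are hypothetical:

    #include <cudf/io/json.hpp>

    #include <string>

    // Hypothetical helper: parse JSON lines, substituting nulls for invalid lines.
    cudf::io::table_with_metadata read_json_lines_tolerant(std::string const& path)
    {
      auto options = cudf::io::json_reader_options::builder(cudf::io::source_info{path})
                       .lines(true)
                       .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
                       .build();
      return cudf::io::read_json(options);
    }
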
*/ diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index e3abbe6056f..024f4f23b94 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -445,9 +445,11 @@ class orc_writer_options { // Set of columns to output table_view _table; // Optional associated metadata - const table_input_metadata* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::map _user_data; + // Optional compression statistics + std::shared_ptr _compression_stats; friend orc_writer_options_builder; @@ -548,7 +550,7 @@ class orc_writer_options { * * @return Associated metadata */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -560,6 +562,16 @@ class orc_writer_options { return _user_data; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + // Setters /** @@ -637,7 +649,7 @@ class orc_writer_options { * * @param meta Associated metadata */ - void set_metadata(table_input_metadata const* meta) { _metadata = meta; } + void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); } /** * @brief Sets metadata. @@ -648,6 +660,16 @@ class orc_writer_options { { _user_data = std::move(metadata); } + + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } }; /** @@ -757,9 +779,9 @@ class orc_writer_options_builder { * @param meta Associated metadata * @return this for chaining */ - orc_writer_options_builder& metadata(table_input_metadata const* meta) + orc_writer_options_builder& metadata(table_input_metadata meta) { - options._metadata = meta; + options._metadata = std::move(meta); return *this; } @@ -775,6 +797,19 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + orc_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -826,9 +861,11 @@ class chunked_orc_writer_options { // Row index stride (maximum number of rows in each row group) size_type _row_index_stride = default_row_index_stride; // Optional associated metadata - const table_input_metadata* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::map _user_data; + // Optional compression statistics + std::shared_ptr _compression_stats; friend chunked_orc_writer_options_builder; @@ -907,7 +944,7 @@ class chunked_orc_writer_options { * * @return Associated metadata */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. 
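
[Illustrative aside, not part of the patch.] Two caller-visible changes in the ORC writer options above: table_input_metadata is now stored by value (no raw pointer to keep alive for the writer's lifetime), and an optional writer_compression_statistics object can be attached and read back once the write finishes. A hedged sketch; the helper name, column name, and output path are hypothetical:

    #include <cudf/io/orc.hpp>
    #include <cudf/io/types.hpp>
    #include <cudf/table/table_view.hpp>

    #include <memory>
    #include <string>
    #include <utility>

    // Hypothetical helper: write ORC, then inspect the achieved compression ratio.
    double write_orc_with_stats(cudf::table_view const& table, std::string const& path)
    {
      auto stats = std::make_shared<cudf::io::writer_compression_statistics>();

      cudf::io::table_input_metadata metadata(table);  // passed by value below
      if (not metadata.column_metadata.empty()) { metadata.column_metadata[0].set_name("col0"); }

      auto options = cudf::io::orc_writer_options::builder(cudf::io::sink_info{path}, table)
                       .metadata(std::move(metadata))
                       .compression_statistics(stats)
                       .build();
      cudf::io::write_orc(options);

      return stats->compression_ratio();  // populated once the writer is done
    }
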
@@ -919,6 +956,16 @@ class chunked_orc_writer_options { return _user_data; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + // Setters /** @@ -989,7 +1036,7 @@ class chunked_orc_writer_options { * * @param meta Associated metadata */ - void metadata(table_input_metadata const* meta) { _metadata = meta; } + void metadata(table_input_metadata meta) { _metadata = std::move(meta); } /** * @brief Sets Key-Value footer metadata. @@ -1000,6 +1047,16 @@ class chunked_orc_writer_options { { _user_data = std::move(metadata); } + + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } }; /** @@ -1094,9 +1151,9 @@ class chunked_orc_writer_options_builder { * @param meta Associated metadata * @return this for chaining */ - chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta) + chunked_orc_writer_options_builder& metadata(table_input_metadata meta) { - options._metadata = meta; + options._metadata = std::move(meta); return *this; } @@ -1113,6 +1170,19 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + chunked_orc_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 9ad16a0e173..623ee2e49fc 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -355,13 +355,13 @@ class orc_metadata { }; /** - * @brief Reads file-level and stripe-level statistics of ORC dataset. + * @brief Reads metadata of ORC dataset. * * @ingroup io_readers * * @param src_info Dataset source * - * @return Column names and decoded ORC statistics + * @return orc_metadata with ORC schema, number of rows and number of stripes. */ orc_metadata read_orc_metadata(source_info const& src_info); diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 07d41e3b132..788ff15f3c1 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -62,6 +63,9 @@ class parquet_reader_options { // Number of rows to read; `nullopt` is all std::optional _num_rows; + // Predicate filter as AST to filter output rows. + std::optional> _filter; + // Whether to store string data as categorical type bool _convert_strings_to_categories = false; // Whether to use PANDAS metadata to load columns @@ -160,6 +164,13 @@ class parquet_reader_options { */ [[nodiscard]] auto const& get_row_groups() const { return _row_groups; } + /** + * @brief Returns AST based filter for predicate pushdown. + * + * @return AST expression to use as filter + */ + [[nodiscard]] auto const& get_filter() const { return _filter; } + /** * @brief Returns timestamp type used to cast timestamp columns. 
* @@ -181,6 +192,13 @@ class parquet_reader_options { */ void set_row_groups(std::vector> row_groups); + /** + * @brief Sets AST based filter for predicate pushdown. + * + * @param filter AST expression to use as filter + */ + void set_filter(ast::expression const& filter) { _filter = filter; } + /** * @brief Sets to enable/disable conversion of strings to categories. * @@ -273,6 +291,18 @@ class parquet_reader_options_builder { return *this; } + /** + * @brief Sets vector of individual row groups to read. + * + * @param filter Vector of row groups to read + * @return this for chaining + */ + parquet_reader_options_builder& filter(ast::expression const& filter) + { + options.set_filter(filter); + return *this; + } + /** * @brief Sets enable/disable conversion of strings to categories. * @@ -472,7 +502,7 @@ class parquet_writer_options { // Partitions described as {start_row, num_rows} pairs std::vector _partitions; // Optional associated metadata - table_input_metadata const* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. @@ -496,6 +526,10 @@ class parquet_writer_options { size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment std::optional _max_page_fragment_size; + // Optional compression statistics + std::shared_ptr _compression_stats; + // write V2 page headers? + bool _v2_page_headers = false; /** * @brief Constructor from sink and table. @@ -575,7 +609,7 @@ class parquet_writer_options { * * @return Associated metadata */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -670,6 +704,23 @@ class parquet_writer_options { */ [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + + /** + * @brief Returns `true` if V2 page headers should be written. + * + * @return `true` if V2 page headers should be written. + */ + [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** * @brief Sets partitions. * @@ -683,7 +734,7 @@ class parquet_writer_options { * * @param metadata Associated metadata */ - void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } /** * @brief Sets metadata. @@ -777,6 +828,23 @@ class parquet_writer_options { * @param size_rows Maximum page fragment size, in rows. */ void set_max_page_fragment_size(size_type size_rows); + + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } + + /** + * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. + * + * @param val Boolean value to enable/disable writing of V2 page headers. 
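
[Illustrative aside, not part of the patch.] The new filter() option takes a libcudf AST expression, the same expression machinery used by compute_column and the conditional joins, letting the reader drop rows that fail the predicate. A sketch of pushing a simple "col0 < 100" predicate into the reader; the column index, literal value, and helper name are made up, and the operands are kept alive for the duration of the read because the options appear to hold the expression by reference:

    #include <cudf/ast/expressions.hpp>
    #include <cudf/io/parquet.hpp>
    #include <cudf/scalar/scalar.hpp>

    #include <string>

    // Hypothetical helper: read only rows whose first column is < 100.
    cudf::io::table_with_metadata read_filtered(std::string const& path)
    {
      cudf::ast::column_reference col0{0};
      cudf::numeric_scalar<int32_t> hundred{100};
      cudf::ast::literal lit{hundred};
      cudf::ast::operation filter{cudf::ast::ast_operator::LESS, col0, lit};

      auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info{path})
                       .filter(filter)
                       .build();
      return cudf::io::read_parquet(options);
    }
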
+ */ + void enable_write_v2_headers(bool val) { _v2_page_headers = val; } }; /** @@ -819,9 +887,9 @@ class parquet_writer_options_builder { * @param metadata Associated metadata * @return this for chaining */ - parquet_writer_options_builder& metadata(table_input_metadata const* metadata) + parquet_writer_options_builder& metadata(table_input_metadata metadata) { - options._metadata = metadata; + options._metadata = std::move(metadata); return *this; } @@ -983,6 +1051,19 @@ class parquet_writer_options_builder { */ parquet_writer_options_builder& max_page_fragment_size(size_type val); + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + parquet_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -995,6 +1076,14 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if V2 page headers are to be written. + * + * @param enabled Boolean value to enable/disable writing of V2 page headers. + * @return this for chaining + */ + parquet_writer_options_builder& write_v2_headers(bool enabled); + /** * @brief move parquet_writer_options member once it's built. */ @@ -1037,7 +1126,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const * @return A parquet-compatible blob that contains the data for all row groups in the list */ std::unique_ptr> merge_row_group_metadata( - const std::vector>>& metadata_list); + std::vector>> const& metadata_list); class chunked_parquet_writer_options_builder; @@ -1052,7 +1141,7 @@ class chunked_parquet_writer_options { // Specify the level of statistics in the output file statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Optional associated metadata. - table_input_metadata const* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. @@ -1074,6 +1163,10 @@ class chunked_parquet_writer_options { size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment std::optional _max_page_fragment_size; + // Optional compression statistics + std::shared_ptr _compression_stats; + // write V2 page headers? + bool _v2_page_headers = false; /** * @brief Constructor from sink. @@ -1118,7 +1211,7 @@ class chunked_parquet_writer_options { * * @return Metadata information */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -1204,12 +1297,29 @@ class chunked_parquet_writer_options { */ [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + + /** + * @brief Returns `true` if V2 page headers should be written. + * + * @return `true` if V2 page headers should be written. 
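
[Illustrative aside, not part of the patch.] The Parquet writer gains the same compression-statistics hook as ORC, plus opt-in V2 data page headers; both the single-shot and chunked option classes carry the new settings. A sketch against the single-shot writer, with hypothetical names:

    #include <cudf/io/parquet.hpp>
    #include <cudf/table/table_view.hpp>

    #include <memory>
    #include <string>

    // Hypothetical helper: write Parquet with V2 page headers and collect stats.
    void write_parquet_v2(cudf::table_view const& table, std::string const& path)
    {
      auto stats = std::make_shared<cudf::io::writer_compression_statistics>();

      auto options = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{path}, table)
                       .write_v2_headers(true)
                       .compression_statistics(stats)
                       .build();
      cudf::io::write_parquet(options);

      // Input bytes seen by the compressor: compressed + failed + skipped.
      auto total_in = stats->num_total_input_bytes();
      (void)total_in;
    }
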
+ */ + [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** * @brief Sets metadata. * * @param metadata Associated metadata */ - void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } /** * @brief Sets Key-Value footer metadata. @@ -1297,6 +1407,23 @@ class chunked_parquet_writer_options { */ void set_max_page_fragment_size(size_type size_rows); + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } + + /** + * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. + * + * @param val Boolean value to enable/disable writing of V2 page headers. + */ + void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -1334,9 +1461,9 @@ class chunked_parquet_writer_options_builder { * @param metadata Associated metadata * @return this for chaining */ - chunked_parquet_writer_options_builder& metadata(table_input_metadata const* metadata) + chunked_parquet_writer_options_builder& metadata(table_input_metadata metadata) { - options._metadata = metadata; + options._metadata = std::move(metadata); return *this; } @@ -1350,7 +1477,7 @@ class chunked_parquet_writer_options_builder { std::vector> metadata); /** - * @brief Sets Sets the level of statistics in chunked_parquet_writer_options. + * @brief Sets the level of statistics in chunked_parquet_writer_options. * * @param sf Level of statistics requested in the output file * @return this for chaining @@ -1388,6 +1515,14 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if V2 page headers are to be written. + * + * @param enabled Boolean value to enable/disable writing of V2 page headers. + * @return this for chaining + */ + chunked_parquet_writer_options_builder& write_v2_headers(bool enabled); + /** * @brief Sets the maximum row group size, in bytes. * @@ -1503,6 +1638,19 @@ class chunked_parquet_writer_options_builder { */ chunked_parquet_writer_options_builder& max_page_fragment_size(size_type val); + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + chunked_parquet_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/parquet_metadata.hpp b/cpp/include/cudf/io/parquet_metadata.hpp new file mode 100644 index 00000000000..0c985fc3c69 --- /dev/null +++ b/cpp/include/cudf/io/parquet_metadata.hpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file parquet_metadata.hpp + * @brief cuDF-IO freeform API + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cudf { +namespace io { + +namespace parquet { +/** + * @brief Basic data types in Parquet, determines how data is physically stored + */ +enum class TypeKind : int8_t { + UNDEFINED_TYPE = -1, // Undefined for non-leaf nodes + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, // Deprecated + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, +}; +} // namespace parquet + +/** + * @brief Schema of a parquet column, including the nested columns. + */ +struct parquet_column_schema { + public: + /** + * @brief constructor + * + * @param name column name + * @param type parquet type + * @param children child columns (empty for non-nested types) + */ + parquet_column_schema(std::string_view name, + parquet::TypeKind type, + std::vector children) + : _name{name}, _type_kind{type}, _children{std::move(children)} + { + } + + /** + * @brief Returns parquet column name; can be empty. + * + * @return Column name + */ + [[nodiscard]] auto name() const { return _name; } + + /** + * @brief Returns parquet type of the column. + * + * @return Column parquet type + */ + [[nodiscard]] auto type_kind() const { return _type_kind; } + + /** + * @brief Returns schemas of all child columns. + * + * @return Children schemas + */ + [[nodiscard]] auto const& children() const& { return _children; } + + /** @copydoc children + * Children array is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto children() && { return std::move(_children); } + + /** + * @brief Returns schema of the child with the given index. + * + * @param idx child index + * + * @return Child schema + */ + [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); } + + /** @copydoc child + * Child is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto child(int idx) && { return std::move(children().at(idx)); } + + /** + * @brief Returns the number of child columns. + * + * @return Children count + */ + [[nodiscard]] auto num_children() const { return children().size(); } + + private: + std::string _name; + // 3 types available: Physical, Converted, Logical. + parquet::TypeKind _type_kind; // Physical + std::vector _children; +}; + +/** + * @brief Schema of a parquet file. + */ +struct parquet_schema { + public: + /** + * @brief constructor + * + * @param root_column_schema root column + */ + parquet_schema(parquet_column_schema root_column_schema) : _root{std::move(root_column_schema)} {} + + /** + * @brief Returns the schema of the struct column that contains all columns as fields. + * + * @return Root column schema + */ + [[nodiscard]] auto const& root() const& { return _root; } + + /** @copydoc root + * Root column schema is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto root() && { return std::move(_root); } + + private: + parquet_column_schema _root; +}; + +/** + * @brief Information about content of a parquet file. 
+ */ +class parquet_metadata { + public: + /// Key-value metadata in the file footer. + using key_value_metadata = std::unordered_map; + + /** + * @brief constructor + * + * @param schema parquet schema + * @param num_rows number of rows + * @param num_rowgroups number of row groups + * @param file_metadata key-value metadata in the file footer + */ + parquet_metadata(parquet_schema schema, + int64_t num_rows, + size_type num_rowgroups, + key_value_metadata file_metadata) + : _schema{std::move(schema)}, + _num_rows{num_rows}, + _num_rowgroups{num_rowgroups}, + _file_metadata{std::move(file_metadata)} + { + } + + /** + * @brief Returns the parquet schema. + * + * @return parquet schema + */ + [[nodiscard]] auto const& schema() const { return _schema; } + + /** + * @brief Returns the number of rows of the root column. + * + * If a file contains list columns, nested columns can have a different number of rows. + * + * @return Number of rows + */ + [[nodiscard]] auto num_rows() const { return _num_rows; } + + /** + * @brief Returns the number of rowgroups in the file. + * + * @return Number of row groups + */ + [[nodiscard]] auto num_rowgroups() const { return _num_rowgroups; } + /** + * @brief Returns the Key value metadata in the file footer. + * + * @return Key value metadata as a map + */ + [[nodiscard]] auto const& metadata() const { return _file_metadata; } + + private: + parquet_schema _schema; + int64_t _num_rows; + size_type _num_rowgroups; + key_value_metadata _file_metadata; +}; + +/** + * @brief Reads metadata of parquet dataset. + * + * @ingroup io_readers + * + * @param src_info Dataset source + * + * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value + * metadata. + */ +parquet_metadata read_parquet_metadata(source_info const& src_info); + +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index f5230863f17..046994d33cc 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,7 +41,7 @@ std::unique_ptr make_source(datasource& data); * @return the data chunk source for the provided host data. It copies data from the host to the * device. */ -std::unique_ptr make_source(host_span data); +std::unique_ptr make_source(host_span data); /** * @brief Creates a data source capable of producing device-buffered views of the file diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 7426811a18d..a97f81182ac 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -32,13 +32,6 @@ #include #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { //! IO interfaces namespace io { @@ -100,6 +93,104 @@ enum statistics_freq { STATISTICS_COLUMN = 3, ///< Full column and offset indices. Implies STATISTICS_ROWGROUP }; +/** + * @brief Statistics about compression performed by a writer. + */ +class writer_compression_statistics { + public: + /** + * @brief Default constructor + */ + writer_compression_statistics() = default; + + /** + * @brief Constructor with initial values. 
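
[Illustrative aside, not part of the patch.] The new public read_parquet_metadata() is the Parquet counterpart of read_orc_metadata(): it only decodes the footer, so schema, row counts, and key-value metadata come back without touching the column data. A sketch of walking the returned schema; the helper name and file path are placeholders:

    #include <cudf/io/parquet_metadata.hpp>
    #include <cudf/io/types.hpp>

    #include <iostream>
    #include <string>

    // Hypothetical helper: print basic facts about a Parquet file from its footer.
    void describe_parquet(std::string const& path)
    {
      auto metadata = cudf::io::read_parquet_metadata(cudf::io::source_info{path});

      std::cout << "rows: " << metadata.num_rows()
                << ", row groups: " << metadata.num_rowgroups() << '\n';

      // The root is a struct-like column whose children are the top-level columns.
      for (auto const& column : metadata.schema().root().children()) {
        std::cout << column.name() << " (physical type "
                  << static_cast<int>(column.type_kind()) << ")\n";
      }
    }
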
+ * + * @param num_compressed_bytes The number of bytes that were successfully compressed + * @param num_failed_bytes The number of bytes that failed to compress + * @param num_skipped_bytes The number of bytes that were skipped during compression + * @param num_compressed_output_bytes The number of bytes in the compressed output + */ + writer_compression_statistics(size_t num_compressed_bytes, + size_t num_failed_bytes, + size_t num_skipped_bytes, + size_t num_compressed_output_bytes) + : _num_compressed_bytes(num_compressed_bytes), + _num_failed_bytes(num_failed_bytes), + _num_skipped_bytes(num_skipped_bytes), + _num_compressed_output_bytes(num_compressed_output_bytes) + { + } + + /** + * @brief Adds the values from another `writer_compression_statistics` object. + * + * @param other The other writer_compression_statistics object + * @return writer_compression_statistics& Reference to this object + */ + writer_compression_statistics& operator+=(writer_compression_statistics const& other) noexcept + { + _num_compressed_bytes += other._num_compressed_bytes; + _num_failed_bytes += other._num_failed_bytes; + _num_skipped_bytes += other._num_skipped_bytes; + _num_compressed_output_bytes += other._num_compressed_output_bytes; + return *this; + } + + /** + * @brief Returns the number of bytes in blocks that were successfully compressed. + * + * This is the number of bytes that were actually compressed, not the size of the compressed + * output. + * + * @return size_t The number of bytes that were successfully compressed + */ + [[nodiscard]] auto num_compressed_bytes() const noexcept { return _num_compressed_bytes; } + + /** + * @brief Returns the number of bytes in blocks that failed to compress. + * + * @return size_t The number of bytes that failed to compress + */ + [[nodiscard]] auto num_failed_bytes() const noexcept { return _num_failed_bytes; } + + /** + * @brief Returns the number of bytes in blocks that were skipped during compression. + * + * @return size_t The number of bytes that were skipped during compression + */ + [[nodiscard]] auto num_skipped_bytes() const noexcept { return _num_skipped_bytes; } + + /** + * @brief Returns the total size of compression inputs. + * + * @return size_t The total size of compression inputs + */ + [[nodiscard]] auto num_total_input_bytes() const noexcept + { + return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes(); + } + + /** + * @brief Returns the compression ratio for the successfully compressed blocks. + * + * Returns nan if there were no successfully compressed blocks. + * + * @return double The ratio between the size of the compression inputs and the size of the + * compressed output. + */ + [[nodiscard]] auto compression_ratio() const noexcept + { + return static_cast(num_compressed_bytes()) / _num_compressed_output_bytes; + } + + private: + std::size_t _num_compressed_bytes = 0; ///< The number of bytes that were successfully compressed + std::size_t _num_failed_bytes = 0; ///< The number of bytes that failed to compress + std::size_t _num_skipped_bytes = 0; ///< The number of bytes that were skipped during compression + std::size_t _num_compressed_output_bytes = 0; ///< The number of bytes in the compressed output +}; + /** * @brief Control use of dictionary encoding for parquet writer */ @@ -110,20 +201,27 @@ enum dictionary_policy { }; /** - * @brief Detailed name information for output columns. + * @brief Detailed name (and optionally nullability) information for output columns. 
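
[Illustrative aside, not part of the patch.] writer_compression_statistics is plain host-side bookkeeping, so its accumulation and ratio semantics can be exercised without a GPU. A small sketch, assuming the class is usable host-only as written; the byte counts are made up:

    #include <cudf/io/types.hpp>

    #include <cassert>

    int main()
    {
      // Two batches of compression results, accumulated with operator+=:
      // (compressed, failed, skipped, compressed-output) byte counts.
      cudf::io::writer_compression_statistics stats{800, 100, 100, 400};
      stats += cudf::io::writer_compression_statistics{200, 0, 0, 100};

      assert(stats.num_total_input_bytes() == 1200);  // 1000 + 100 + 100
      assert(stats.compression_ratio() == 2.0);       // 1000 input bytes -> 500 output bytes
      return 0;
    }
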
* * The hierarchy of children matches the hierarchy of children in the output * cudf columns. */ struct column_name_info { std::string name; ///< Column name + std::optional is_nullable; ///< Column nullability std::vector children; ///< Child column names + /** - * @brief Construct a column name info with a name and no children + * @brief Construct a column name info with a name, optional nullabilty, and no children * * @param _name Column name + * @param _is_nullable True if column is nullable */ - column_name_info(std::string const& _name) : name(_name) {} + column_name_info(std::string const& _name, std::optional _is_nullable = std::nullopt) + : name(_name), is_nullable(_is_nullable) + { + } + column_name_info() = default; }; @@ -165,7 +263,7 @@ struct host_buffer { * @param data Pointer to the buffer * @param size Size of the buffer */ - host_buffer(const char* data, size_t size) : data(data), size(size) {} + host_buffer(char const* data, size_t size) : data(data), size(size) {} }; /** @@ -188,8 +286,6 @@ constexpr inline auto is_byte_like_type() * @brief Source information for read interfaces */ struct source_info { - std::vector> _files; //!< Input files - source_info() = default; /** @@ -233,7 +329,7 @@ struct source_info { * @param host_data Input buffer in host memory * @param size Size of the buffer */ - explicit source_info(const char* host_data, size_t size) + explicit source_info(char const* host_data, size_t size) : _type(io_type::HOST_BUFFER), _host_buffers( {cudf::host_span(reinterpret_cast(host_data), size)}) @@ -340,12 +436,6 @@ struct source_info { * @return The device buffers of the input */ [[nodiscard]] auto const& device_buffers() const { return _device_buffers; } - /** - * @brief Get the input files - * - * @return The input files - */ - [[nodiscard]] auto const& files() const { return _files; } /** * @brief Get the user sources of the input * @@ -715,7 +805,17 @@ class table_input_metadata { * * @param table The table_view to construct metadata for */ - table_input_metadata(table_view const& table); + explicit table_input_metadata(table_view const& table); + + /** + * @brief Construct a new table_input_metadata from a table_metadata object. + * + * The constructed table_input_metadata has the same structure, column names and nullability as + * the passed table_metadata. + * + * @param metadata The table_metadata to construct table_intput_metadata for + */ + explicit table_input_metadata(table_metadata const& metadata); std::vector column_metadata; //!< List of column metadata }; diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 11d1bbf9fc8..6c50e1d5998 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -34,10 +34,11 @@ namespace cudf { // forward declaration -namespace detail { +namespace hashing::detail { template -class MurmurHash3_32; - +class MurmurHash3_x86_32; +} // namespace hashing::detail +namespace detail { template class hash_join; } // namespace detail @@ -167,7 +168,7 @@ full_join(cudf::table_view const& left_keys, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a vector of row indices corresponding to a left semi join + * @brief Returns a vector of row indices corresponding to a left semi-join * between the specified tables. 
* * The returned vector contains the row indices from the left table @@ -179,13 +180,9 @@ full_join(cudf::table_view const& left_keys, * Result: {1, 2} * @endcode * - * @throw cudf::logic_error if number of columns in either - * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE - * - * @param[in] left_keys The left table - * @param[in] right_keys The right table - * @param[in] compare_nulls controls whether null join-key values - * should match or not. + * @param left_keys The left table + * @param right_keys The right table + * @param compare_nulls Controls whether null join-key values should match or not * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct @@ -276,7 +273,7 @@ enum class nullable_join : bool { YES, NO }; class hash_join { public: using impl_type = typename cudf::detail::hash_join< - cudf::detail::MurmurHash3_32>; ///< Implementation type + cudf::hashing::detail::MurmurHash3_x86_32>; ///< Implementation type hash_join() = delete; ~hash_join(); @@ -302,7 +299,7 @@ class hash_join { /** * @copydoc hash_join(cudf::table_view const&, null_equality, rmm::cuda_stream_view) * - * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or + * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or * any `probe` table that will be used later for join */ hash_join(cudf::table_view const& build, @@ -326,7 +323,7 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing an inner join between two tables with `build` and `probe` - * as the the join keys . + * as the join keys . */ std::pair>, std::unique_ptr>> @@ -351,7 +348,7 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a left join between two tables with `build` and `probe` - * as the the join keys . + * as the join keys . */ std::pair>, std::unique_ptr>> @@ -376,7 +373,7 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a full join between two tables with `build` and `probe` - * as the the join keys . + * as the join keys . */ std::pair>, std::unique_ptr>> @@ -396,7 +393,7 @@ class hash_join { * constructed with null check. * * @return The exact number of output when performing an inner join between two tables with - * `build` and `probe` as the the join keys . + * `build` and `probe` as the join keys . */ [[nodiscard]] std::size_t inner_join_size( cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; @@ -412,7 +409,7 @@ class hash_join { * constructed with null check. * * @return The exact number of output when performing a left join between two tables with `build` - * and `probe` as the the join keys . + * and `probe` as the join keys . */ [[nodiscard]] std::size_t left_join_size( cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; @@ -430,7 +427,7 @@ class hash_join { * constructed with null check. * * @return The exact number of output when performing a full join between two tables with `build` - * and `probe` as the the join keys . + * and `probe` as the join keys . 
*/ std::size_t full_join_size( cudf::table_view const& probe, @@ -438,7 +435,7 @@ class hash_join { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: - const std::unique_ptr _impl; + const std::unique_ptr _impl; }; /** diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 531396e940e..0bc76828fc3 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -81,9 +81,7 @@ std::unique_ptr concatenate_rows( * @endcode * * @throws std::invalid_argument if the input column is not at least two-level depth lists column - * (i.e., each row must be a list of list). - * @throws cudf::logic_error if the input lists column contains nested typed entries that are not - * lists. + * (i.e., each row must be a list of lists). * * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 83710a49f6a..18fe707fd69 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include namespace cudf { namespace lists { @@ -74,25 +74,15 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, { // size of the gather map is the # of output rows size_type output_count = gather_map_size; - size_type offset_count = output_count + 1; // offsets of the source column int32_t const* src_offsets{source_column.offsets().data() + source_column.offset()}; size_type const src_size = source_column.size(); - // outgoing offsets. these will persist as output from the entire gather operation - auto dst_offsets_c = cudf::make_fixed_width_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view(); auto const source_column_nullmask = source_column.null_mask(); - // generate the compacted outgoing offsets. 
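
[Illustrative aside, not part of the patch.] Looking back at the join changes a few hunks above: hash_join now names cudf::hashing::detail::MurmurHash3_x86_32 as its default hasher, but from the caller's side the point of the class is unchanged: build the hash table once and probe it repeatedly. A sketch with hypothetical table names:

    #include <cudf/join.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf/types.hpp>

    // Hypothetical helper: reuse one build-side hash table for two probes.
    void probe_twice(cudf::table_view const& build_keys,
                     cudf::table_view const& probe_a,
                     cudf::table_view const& probe_b)
    {
      cudf::hash_join joiner(build_keys, cudf::null_equality::EQUAL);

      auto [left_indices_a, right_indices_a] = joiner.inner_join(probe_a);
      auto [left_indices_b, right_indices_b] = joiner.inner_join(probe_b);

      // Each result is a pair of device vectors of matching row indices that can
      // be fed to cudf::gather to materialize the joined tables.
    }
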
- auto count_iter = thrust::make_counting_iterator(0); - thrust::transform_exclusive_scan( - rmm::exec_policy_nosync(stream), - count_iter, - count_iter + offset_count, - dst_offsets_v.begin(), + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, [source_column_nullmask, source_column_offset = source_column.offset(), gather_map, @@ -112,9 +102,10 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; - }, - 0, - thrust::plus()); + }); + + auto [dst_offsets_c, map_size] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + output_count, stream, mr); // handle sliced columns size_type const shift = @@ -147,9 +138,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, }); // Retrieve size of the resulting gather map for level N+1 (the last offset) - size_type child_gather_map_size = - cudf::detail::get_value(dst_offsets_c->view(), output_count, stream); - + auto const child_gather_map_size = static_cast(map_size); return {std::move(dst_offsets_c), std::move(base_offsets), child_gather_map_size}; } diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 18cb147d1e4..f04b2fda2bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -240,11 +240,11 @@ std::unique_ptr scatter(scalar const& slr, rmm::device_buffer null_mask = slr_valid ? cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); - auto offset_column = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); + auto offset_column = + make_numeric_column(data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy_nosync(stream), - offset_column->mutable_view().begin(), - offset_column->mutable_view().end(), + offset_column->mutable_view().begin(), + offset_column->mutable_view().end(), 0, lv->view().size()); auto wrapped = column_view(data_type{type_id::LIST}, diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 336214e3934..8c6368eacb6 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -45,7 +45,7 @@ class lists_column_view : private column_view { */ lists_column_view(column_view const& lists_column); lists_column_view(lists_column_view&&) = default; ///< Move constructor - lists_column_view(const lists_column_view&) = default; ///< Copy constructor + lists_column_view(lists_column_view const&) = default; ///< Copy constructor ~lists_column_view() = default; /** * @brief Copy assignment operator @@ -71,9 +71,7 @@ class lists_column_view : private column_view { using column_view::null_mask; using column_view::offset; using column_view::size; - static_assert(std::is_same_v, - "offset_type is expected to be the same as size_type."); - using offset_iterator = offset_type const*; ///< Iterator type for offsets + using offset_iterator = size_type const*; ///< Iterator type for offsets /** * @brief Returns the parent column. 
@@ -119,7 +117,7 @@ class lists_column_view : private column_view { */ [[nodiscard]] offset_iterator offsets_begin() const noexcept { - return offsets().begin() + offset(); + return offsets().begin() + offset(); } /** diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 360006c1eea..672f479ad53 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -36,6 +36,8 @@ namespace cudf { * @brief Returns the null count for a null mask of the specified `state` * representing `size` elements. * + * @throw std::invalid_argument if state is UNINITIALIZED + * * @param state The state of the null mask * @param size The number of elements represented by the mask * @return The count of null elements @@ -168,5 +170,21 @@ std::pair bitmask_or( table_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Given a validity bitmask, counts the number of null elements (unset bits) + * in the range `[start, stop)`. + * + * If `bitmask == nullptr`, all elements are assumed to be valid and the + * function returns ``. + * + * @throws cudf::logic_error if `start > stop` + * @throws cudf::logic_error if `start < 0` + * + * @param bitmask Validity bitmask residing in device memory. + * @param start Index of the first bit to count (inclusive). + * @param stop Index of the last bit to count (exclusive). + * @return The number of null elements in the specified range. + */ +cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index a5675b5f031..52aebeb55e5 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -198,7 +198,7 @@ std::unique_ptr segmented_reduce( * @returns Scanned output column */ std::unique_ptr scan( - const column_view& input, + column_view const& input, scan_aggregation const& agg, scan_type inclusive, null_policy null_handling = null_policy::EXCLUDE, diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp new file mode 100644 index 00000000000..4cbfb82ae6b --- /dev/null +++ b/cpp/include/cudf/reduction/detail/reduction.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cudf::reduction::detail { + +/** + * @copydoc cudf::reduce(column_view const&, reduce_aggregation const&, data_type, + * std::optional>, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr reduce(column_view const& col, + reduce_aggregation const& agg, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::reduction::detail diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 0dba84a0b28..a747f7bade7 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -183,7 +183,7 @@ struct compound_op : public simple_op { * @copydoc simple_op::template get_null_replacing_element_transformer() */ template - auto get_null_replacing_element_transformer() override + auto get_null_replacing_element_transformer() { using element_transformer = typename Derived::transformer; using OutputType = typename Derived::intermediate::IntermediateType; @@ -202,9 +202,9 @@ struct compound_op : public simple_op { * @return transformed output result of compound operator */ template - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { // Enforced interface return Derived::template intermediate::compute_result(input, count, ddof); @@ -231,9 +231,9 @@ struct mean : public compound_op { using IntermediateType = ResultType; // sum value // compute `mean` from intermediate type `IntermediateType` - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { return (input / count); }; @@ -252,9 +252,9 @@ struct variance : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `variance` from intermediate type `IntermediateType` - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { ResultType mean = input.value / count; ResultType asum = input.value_squared; @@ -278,9 +278,9 @@ struct standard_deviation : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `standard deviation` from intermediate type `IntermediateType` - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { using intermediateOp = variance::template intermediate; ResultType var = intermediateOp::compute_result(input, count, ddof); diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp index 9df58306ace..3405dc8b796 100644 --- a/cpp/include/cudf/replace.hpp +++ b/cpp/include/cudf/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -45,6 +46,7 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; * * @param[in] input A column whose null values will be replaced * @param[in] replacement A cudf::column whose values will replace null values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns A copy of `input` with the null values replaced with corresponding values from @@ -53,6 +55,7 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; std::unique_ptr replace_nulls( column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,6 +66,7 @@ std::unique_ptr replace_nulls( * * @param[in] input A column whose null values will be replaced * @param[in] replacement Scalar used to replace null values in `input` + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns Copy of `input` with null values replaced by `replacement` @@ -70,6 +74,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,6 +85,7 @@ std::unique_ptr replace_nulls( * * @param[in] input A column whose null values will be replaced * @param[in] replace_policy Specify the position of replacement values relative to null values + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns Copy of `input` with null values replaced based on `replace_policy` @@ -87,6 +93,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, replace_policy const& replace_policy, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -106,6 +113,7 @@ std::unique_ptr replace_nulls( * * @param input A column whose NaN values will be replaced * @param replacement A cudf::column whose values will replace NaN values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A copy of `input` with the NaN values replaced with corresponding values from * `replacement`. 
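As a quick illustration of the stream parameters being added in this header, the sketch below forwards a caller-provided stream to the new `replace_nulls` overload; the wrapper function and column names are hypothetical, and the memory-resource argument is left at its default.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/replace.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Hypothetical helper: fill nulls in `input` from the corresponding rows of
// `replacement`, issuing all device work on the caller's stream.
std::unique_ptr<cudf::column> fill_nulls_on_stream(cudf::column_view const& input,
                                                   cudf::column_view const& replacement,
                                                   rmm::cuda_stream_view stream)
{
  // `stream` defaults to cudf::get_default_stream(), so existing call sites that
  // pass only (input, replacement) continue to compile unchanged.
  return cudf::replace_nulls(input, replacement, stream);
}
```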
@@ -113,6 +121,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +141,14 @@ std::unique_ptr replace_nans( * * @param input A column whose NaN values will be replaced * @param replacement A cudf::scalar whose value will replace NaN values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A copy of `input` with the NaN values replaced by `replacement` */ std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -147,6 +158,7 @@ std::unique_ptr replace_nans( * @param input_col The column to find and replace values in * @param values_to_replace The values to replace * @param replacement_values The values to replace with + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Copy of `input_col` with specified values replaced @@ -155,6 +167,7 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -198,6 +211,7 @@ std::unique_ptr find_and_replace_all( * @param[in] hi Maximum clamp value. All elements greater than `hi` will be replaced by * `hi_replace`. Ignored if null. * @param[in] hi_replace All elements greater than `hi` will be replaced by `hi_replace` + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Returns a clamped column as per `lo` and `hi` boundaries @@ -208,6 +222,7 @@ std::unique_ptr clamp( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -244,6 +259,7 @@ std::unique_ptr clamp( * if null. * @param[in] hi Maximum clamp value. All elements greater than `hi` will be replaced by `hi` * Ignored if null. + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Returns a clamped column as per `lo` and `hi` boundaries @@ -252,6 +268,7 @@ std::unique_ptr clamp( column_view const& input, scalar const& lo, scalar const& hi, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -264,12 +281,14 @@ std::unique_ptr clamp( * * @throws cudf::logic_error if column does not have floating point data type. 
* @param[in] input column_view of floating-point elements to copy and normalize + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr device_memory_resource allocator for allocating output data * * @returns new column with the modified data */ std::unique_ptr normalize_nans_and_zeros( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -282,8 +301,10 @@ std::unique_ptr normalize_nans_and_zeros( * * @throws cudf::logic_error if column does not have floating point data type. * @param[in, out] in_out of floating-point elements to normalize + * @param stream CUDA stream used for device memory operations and kernel launches */ -void normalize_nans_and_zeros(mutable_column_view& in_out); +void normalize_nans_and_zeros(mutable_column_view& in_out, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index aa78979bf7a..efdb85691bd 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -114,19 +114,29 @@ struct window_bounds { return window_bounds(true, std::numeric_limits::max()); } - // TODO: In the future, add units for bounds. - // E.g. {value=1, unit=DAYS, unbounded=false} - // For the present, assume units from context: - // 1. For time-based window functions, assume DAYS as before - // 2. For all else, assume ROWS as before. - const bool is_unbounded; ///< Whether the window boundary is unbounded - const size_type value; ///< Finite window boundary value (in days or rows) + /** + * Whether the window_bounds is unbounded. + * + * @return true if the window bounds is unbounded. + * @return false if the window bounds has a finite row boundary. + */ + [[nodiscard]] bool is_unbounded() const { return _is_unbounded; } + + /** + * @brief Gets the row-boundary for this window_bounds. + * + * @return the row boundary value (in days or rows) + */ + [[nodiscard]] size_type value() const { return _value; } private: explicit window_bounds(bool is_unbounded_, size_type value_ = 0) - : is_unbounded{is_unbounded_}, value{value_} + : _is_unbounded{is_unbounded_}, _value{value_} { } + + bool const _is_unbounded; ///< Whether the window boundary is unbounded + size_type const _value; ///< Finite window boundary value (in days or rows) }; /** diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp index b08792740ff..c5b0c219373 100644 --- a/cpp/include/cudf/rolling/range_window_bounds.hpp +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -59,7 +59,7 @@ struct range_window_bounds { * @brief Factory method to construct a window boundary * limited to the value of the current row * - * @param type type The datatype of the window boundary + * @param type The datatype of the window boundary * @return A "current row" window boundary object */ static range_window_bounds current_row(data_type type); @@ -75,7 +75,7 @@ struct range_window_bounds { /** * @brief Factory method to construct an unbounded window boundary. 
* - * @param type type The datatype of the window boundary + * @param type The datatype of the window boundary * @return An unbounded window boundary object */ static range_window_bounds unbounded(data_type type); diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dab085ad7d5..af5e6d6b2d6 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -457,6 +457,8 @@ class string_scalar : public scalar { /** * @brief Construct a new string scalar object. * + * @throws std::overflow_error If the size of the input string exceeds cudf::size_type + * * @param string The value of the string. * @param is_valid Whether the value held by the scalar is valid. * @param stream CUDA stream used for device memory operations. @@ -545,7 +547,7 @@ class string_scalar : public scalar { * @brief Returns a raw pointer to the string in device memory. * @return a raw pointer to the string in device memory */ - [[nodiscard]] const char* data() const; + [[nodiscard]] char const* data() const; protected: rmm::device_buffer _data{}; ///< device memory containing the string diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 18bcd89a00b..846da0bbe10 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -304,7 +304,7 @@ class string_scalar_device_view : public detail::scalar_device_view_base { * validity of the stored value * @param size The pointer to the size of the string in device memory */ - string_scalar_device_view(data_type type, const char* data, bool* is_valid, size_type size) + string_scalar_device_view(data_type type, char const* data, bool* is_valid, size_type size) : detail::scalar_device_view_base(type, is_valid), _data(data), _size(size) { } @@ -337,7 +337,7 @@ class string_scalar_device_view : public detail::scalar_device_view_base { [[nodiscard]] __device__ size_type size() const noexcept { return _size; } private: - const char* _data{}; ///< Pointer to device memory containing the value + char const* _data{}; ///< Pointer to device memory containing the value size_type _size; ///< Size of the string in bytes }; diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index fee22786d7a..49acce6a63b 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -63,6 +63,7 @@ namespace cudf { * @param needles Values for which to find the insert locations in the search space * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the insertion points */ @@ -71,6 +72,7 @@ std::unique_ptr lower_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -103,6 +105,7 @@ std::unique_ptr lower_bound( * @param needles Values for which to find the insert locations in the 
search space * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the insertion points */ @@ -111,6 +114,7 @@ std::unique_ptr upper_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,9 +132,12 @@ std::unique_ptr upper_bound( * * @param haystack The column containing search space * @param needle A scalar value to check for existence in the search space + * @param stream CUDA stream used for device memory operations and kernel launches * @return true if the given `needle` value exists in the `haystack` column */ -bool contains(column_view const& haystack, scalar const& needle); +bool contains(column_view const& haystack, + scalar const& needle, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Check if the given `needles` values exists in the `haystack` column. @@ -149,12 +156,14 @@ bool contains(column_view const& haystack, scalar const& needle); * * @param haystack The column containing search space * @param needles A column of values to check for existence in the search space + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A BOOL column indicating if each element in `needles` exists in the search space */ std::unique_ptr contains( column_view const& haystack, column_view const& needles, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index e2a6b97256f..984e3037cd1 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -254,22 +254,19 @@ std::unique_ptr
unique( * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. If there are duplicate rows, which row to be copied depends on the specified value of - * the `keep` parameter. + * rows. If there are duplicate rows, which row is copied depends on the `keep` parameter. * * The order of rows in the output table is not specified. * * Performance hint: if the input is pre-sorted, `cudf::unique` can produce an equivalent result * (i.e., same set of output rows) but with less running time than `cudf::distinct`. * - * @param[in] input input table_view to copy only distinct rows - * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep any, first, last, or none of the found duplicates - * @param[in] nulls_equal flag to control if nulls are compared equal or not - * @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not - * @param[in] mr Device memory resource used to allocate the returned table's device - * memory - * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows in an unspecified order */ std::unique_ptr
<table> distinct( @@ -280,6 +277,36 @@ std::unique_ptr<table>
distinct( nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new table without duplicate rows, preserving input order. + * + * Given an `input` table_view, each row is copied to the output table to create a set of distinct + * rows. The input row order is preserved. If there are duplicate rows, which row is copied depends + * on the `keep` parameter. + * + * This API produces the same output rows as `cudf::distinct`, but with input order preserved. + * + * Note that when `keep` is `KEEP_ANY`, the choice of which duplicate row to keep is arbitrary, but + * the returned table will retain the input order. That is, if the key column contained `1, 2, 1` + * with another values column `3, 4, 5`, the result could contain values `3, 4` or `4, 5` but not + * `4, 3` or `5, 4`. + * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table + * @return Table with distinct rows, preserving input order + */ +std::unique_ptr
stable_distinct( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Count the number of consecutive groups of equivalent rows in a column. * diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 067f646fc33..71f65ac9080 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -258,8 +258,8 @@ std::unique_ptr concatenate( * @return New strings column with concatenated results. */ std::unique_ptr join_list_elements( - const lists_column_view& lists_strings_column, - const strings_column_view& separators, + lists_column_view const& lists_strings_column, + strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, @@ -316,7 +316,7 @@ std::unique_ptr join_list_elements( * @return New strings column with concatenated results. */ std::unique_ptr join_list_elements( - const lists_column_view& lists_strings_column, + lists_column_view const& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index a7e09e09bac..fa729d26734 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -243,7 +243,7 @@ std::unique_ptr from_timestamps( column_view const& timestamps, std::string_view format = "%Y-%m-%dT%H:%M:%SZ", strings_column_view const& names = strings_column_view(column_view{ - data_type{type_id::STRING}, 0, nullptr}), + data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 3208d5f8f3b..44213b84139 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,7 +81,7 @@ std::unique_ptr from_integers( * * The output row entry will be set to `true` if the corresponding string element * have all characters in [-+0-9]. The optional sign character must only be in the first - * position. Notice that the the integer value is not checked to be within its storage limits. + * position. Notice that the integer value is not checked to be within its storage limits. * For strict integer type check, use the other `is_integer()` API which accepts `data_type` * argument. 
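To make the loose-versus-strict distinction described above concrete, here is a small sketch assuming the two `is_integer` overloads declared in this header; the column contents in the comments are hypothetical.

```cpp
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

// Suppose `strs` holds ["123", "+456", "99999999999", "1.5"].
void check_integer_strings(cudf::strings_column_view const& strs)
{
  // Character-set check only: "99999999999" reports true even though it
  // cannot be stored in an int32.
  auto loose = cudf::strings::is_integer(strs);

  // Strict check: additionally verifies each value fits the requested type,
  // so the overflowing row reports false here.
  auto strict = cudf::strings::is_integer(strs, cudf::data_type{cudf::type_id::INT32});

  (void)loose;
  (void)strict;
}
```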
* diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index e2480b459b9..7ab1bf47b0a 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ std::unique_ptr format_list_column( lists_column_view const& input, string_scalar const& na_rep = string_scalar("NULL"), strings_column_view const& separators = strings_column_view(column_view{ - data_type{type_id::STRING}, 0, nullptr}), + data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 275b7223a3b..0901076c835 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ using character_flags_table_type = std::uint8_t; * * @return Device memory pointer to character flags table. */ -const character_flags_table_type* get_character_flags_table(); +character_flags_table_type const* get_character_flags_table(); // utilities to dissect a character-table flag constexpr uint8_t IS_DECIMAL(uint8_t x) { return ((x) & (1 << 0)); } @@ -61,7 +61,7 @@ using character_cases_table_type = uint16_t; * * @return Device memory pointer to character cases table. */ -const character_cases_table_type* get_character_cases_table(); +character_cases_table_type const* get_character_cases_table(); /** * @brief Case mapping structure for special characters. diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 185754a00c8..dd55cae4537 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ namespace detail { */ template __device__ inline thrust::pair parse_integer( - char const*& iter, char const* iter_end, const char decimal_pt_char = '.') + char const*& iter, char const* iter_end, char const decimal_pt_char = '.') { // highest value where another decimal digit cannot be appended without an overflow; // this preserves the most digits when scaling the final result for this type diff --git a/cpp/include/cudf/strings/detail/convert/is_float.cuh b/cpp/include/cudf/strings/detail/convert/is_float.cuh index 92c993cfbb5..5b09da96dc4 100644 --- a/cpp/include/cudf/strings/detail/convert/is_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/is_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,7 +83,7 @@ inline __device__ bool is_float(string_view const& d_str) bool decimal_found = false; bool exponent_found = false; size_type bytes = d_str.size_bytes(); - const char* data = d_str.data(); + char const* data = d_str.data(); // sign character allowed at the beginning of the string size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0; diff --git a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh index 8721f21a7c0..ab934750f9e 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ namespace detail { */ __device__ inline double stod(string_view const& d_str) { - const char* in_ptr = d_str.data(); - const char* end = in_ptr + d_str.size_bytes(); + char const* in_ptr = d_str.data(); + char const* end = in_ptr + d_str.size_bytes(); if (end == in_ptr) return 0.0; double sign{1.0}; if (*in_ptr == '-' || *in_ptr == '+') { diff --git a/cpp/include/cudf/strings/detail/convert/string_to_int.cuh b/cpp/include/cudf/strings/detail/convert/string_to_int.cuh index 6c8de06602e..8bbaea9390c 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_int.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_int.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ __device__ inline int64_t string_to_integer(string_view const& d_str) int64_t value = 0; size_type bytes = d_str.size_bytes(); if (bytes == 0) return value; - const char* ptr = d_str.data(); + char const* ptr = d_str.data(); int sign = 1; if (*ptr == '-' || *ptr == '+') { sign = (*ptr == '-' ? -1 : 1); diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28b98eac3b5..7cd2338cb67 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,16 +33,14 @@ #include #include #include -#include -#include -#include +#include namespace cudf { namespace strings { namespace detail { // Helper function for loading 16B from a potentially unaligned memory location to registers. -__forceinline__ __device__ uint4 load_uint4(const char* ptr) +__forceinline__ __device__ uint4 load_uint4(char const* ptr) { auto const offset = reinterpret_cast(ptr) % 4; auto const* aligned_ptr = reinterpret_cast(ptr - offset); @@ -100,7 +99,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, // This check is necessary because string_indices[istring] may be out of bound. 
if (out_start == out_end) continue; - const char* in_start = strings_begin[string_indices[istring]].data(); + char const* in_start = strings_begin[string_indices[istring]].data(); // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`. // `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`. @@ -294,58 +293,31 @@ std::unique_ptr gather(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const output_count = std::distance(begin, end); - auto const strings_count = strings.size(); + auto const output_count = std::distance(begin, end); if (output_count == 0) return make_empty_column(type_id::STRING); - // allocate offsets column and use memory to compute string size in each output row - auto out_offsets_column = make_numeric_column( - data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_out_offsets = out_offsets_column->mutable_view().template data(); - auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; - auto const d_strings = column_device_view::create(strings.parent(), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - begin, - end, - d_out_offsets, - [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) { - if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; - if (not d_strings.is_valid(in_idx)) return 0; - return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; - }); + // build offsets column + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto const d_in_offsets = !strings.is_empty() ? strings.offsets_begin() : nullptr; - // check total size is not too large - size_t const total_bytes = thrust::transform_reduce( - rmm::exec_policy_nosync(stream), - d_out_offsets, - d_out_offsets + output_count, - [] __device__(auto size) { return static_cast(size); }, - size_t{0}, - thrust::plus{}); - CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), - "total size of output strings is too large for a cudf column"); - - // In-place convert output sizes into offsets - thrust::exclusive_scan(rmm::exec_policy_nosync(stream), - d_out_offsets, - d_out_offsets + output_count + 1, - d_out_offsets); + auto offsets_itr = thrust::make_transform_iterator( + begin, [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) { + if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; } + if (not d_strings.is_valid(idx)) { return 0; } + return d_in_offsets[idx + 1] - d_in_offsets[idx]; + }); + auto [out_offsets_column, total_bytes] = + cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr); // build chars column - cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto out_chars_column = gather_chars(d_strings->begin(), - begin, - end, - d_out_offsets_span, - static_cast(total_bytes), - stream, - mr); + auto const offsets_view = out_offsets_column->view(); + auto out_chars_column = gather_chars( + d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); return make_strings_column(output_count, std::move(out_offsets_column), std::move(out_chars_column), - 0, + 0, // caller sets these rmm::device_buffer{}); } diff --git a/cpp/include/cudf/strings/detail/split_utils.cuh b/cpp/include/cudf/strings/detail/split_utils.cuh index 99a5edaf91a..a95a9ee23bd 100644 --- a/cpp/include/cudf/strings/detail/split_utils.cuh 
+++ b/cpp/include/cudf/strings/detail/split_utils.cuh @@ -22,6 +22,35 @@ namespace cudf { namespace strings { namespace detail { +constexpr bool is_whitespace(char_utf8 ch) { return ch <= ' '; } + +/** + * @brief Count tokens delimited by whitespace + * + * @param d_str String to tokenize + * @param max_tokens Maximum number of tokens to count + * @return Number of tokens delimited by whitespace + */ +__device__ inline size_type count_tokens_whitespace( + string_view d_str, size_type const max_tokens = std::numeric_limits::max()) +{ + auto token_count = size_type{0}; + auto spaces = true; + auto itr = d_str.data(); + auto const end = itr + d_str.size_bytes(); + while (itr < end && token_count < max_tokens) { + cudf::char_utf8 ch = 0; + auto const chr_width = cudf::strings::detail::to_char_utf8(itr, ch); + if (spaces == is_whitespace(ch)) { + itr += chr_width; + } else { + token_count += static_cast(spaces); + spaces = !spaces; + } + } + return token_count; +} + // JIT has trouble including thrust/pair.h struct position_pair { size_type first; @@ -43,26 +72,33 @@ struct whitespace_string_tokenizer { */ __device__ bool next_token() { - if (itr != d_str.begin()) { // skip these 2 lines the first time through - ++itr; - start_position = itr.byte_offset(); // end_position + 1; + if (start_position >= d_str.size_bytes()) { return false; } + auto const src_ptr = d_str.data(); + if (current_position != 0) { + current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]); + start_position = current_position; } - if (start_position >= d_str.size_bytes()) return false; + if (start_position >= d_str.size_bytes()) { return false; } // continue search for the next token end_position = d_str.size_bytes(); - for (; itr < d_str.end(); ++itr) { - if (spaces == (*itr <= ' ')) { - if (spaces) - start_position = (itr + 1).byte_offset(); - else - end_position = (itr + 1).byte_offset(); + while (current_position < d_str.size_bytes()) { + cudf::char_utf8 ch = 0; + auto const chr_width = cudf::strings::detail::to_char_utf8(src_ptr + current_position, ch); + if (spaces == is_whitespace(ch)) { + current_position += chr_width; + if (spaces) { + start_position = current_position; + } else { + end_position = current_position; + } continue; } spaces = !spaces; if (spaces) { - end_position = itr.byte_offset(); + end_position = current_position; break; } + current_position += chr_width; } return start_position < end_position; } @@ -106,7 +142,8 @@ struct whitespace_string_tokenizer { spaces(true), start_position{reverse ? d_str.size_bytes() + 1 : 0}, end_position{d_str.size_bytes()}, - itr{reverse ? d_str.end() : d_str.begin()} + itr{reverse ? 
d_str.end() : d_str.begin()}, + current_position{0} { } @@ -116,6 +153,7 @@ struct whitespace_string_tokenizer { cudf::string_view::const_iterator itr; size_type start_position; size_type end_position; + size_type current_position; }; } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 02a65c01178..5f8a2a34606 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -79,8 +79,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Convert the sizes to offsets auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", std::overflow_error); // Now build the chars column diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index b219b28cf9b..7e608cd10f0 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -45,7 +45,7 @@ namespace detail { * @brief Basic type expected for iterators passed to `make_strings_column` that represent string * data in device memory. */ -using string_index_pair = thrust::pair; +using string_index_pair = thrust::pair; /** * @brief Average string byte-length threshold for deciding character-level diff --git a/cpp/include/cudf/strings/detail/strip.cuh b/cpp/include/cudf/strings/detail/strip.cuh index 533e76121b5..264ea0c103a 100644 --- a/cpp/include/cudf/strings/detail/strip.cuh +++ b/cpp/include/cudf/strings/detail/strip.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,8 @@ __device__ cudf::string_view strip(cudf::string_view const d_str, cudf::string_view const d_to_strip, side_type side = side_type::BOTH) { + if (d_str.empty()) { return cudf::string_view{}; } // sanitize empty return + auto is_strip_character = [d_to_strip](char_utf8 chr) -> bool { if (d_to_strip.empty()) return chr <= ' '; // whitespace check for (auto c : d_to_strip) { diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 9e1bd10c1cf..df8e2885782 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -108,7 +108,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(const char* str, char_utf8& character) +constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 1d48a5cc201..5c719cd25d2 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,9 @@ #include #include +#include +#include + #include #include @@ -29,14 +32,15 @@ namespace detail { * @brief Copies input string data into a buffer and increments the pointer by the number of bytes * copied. * - * @param buffer Device buffer to copy to. - * @param input Data to copy from. - * @param bytes Number of bytes to copy. - * @return Pointer to the end of the output buffer after the copy. + * @param buffer Device buffer to copy to + * @param input Data to copy from + * @param bytes Number of bytes to copy + * @return Pointer to the end of the output buffer after the copy */ -__device__ inline char* copy_and_increment(char* buffer, const char* input, size_type bytes) +__device__ inline char* copy_and_increment(char* buffer, char const* input, size_type bytes) { - memcpy(buffer, input, bytes); + // this can be slightly faster than memcpy + thrust::copy_n(thrust::seq, input, bytes, buffer); return buffer + bytes; } @@ -48,7 +52,7 @@ __device__ inline char* copy_and_increment(char* buffer, const char* input, size * @param d_string String to copy. * @return Pointer to the end of the output buffer after the copy. */ -__device__ inline char* copy_string(char* buffer, const string_view& d_string) +__device__ inline char* copy_string(char* buffer, string_view const& d_string) { return copy_and_increment(buffer, d_string.data(), d_string.size_bytes()); } @@ -62,7 +66,7 @@ class per_context_cache { // If there is no object available in the cache, it calls the initializer // `init` to create a new one and cache it for later uses. template - TableType* find_or_initialize(const Initializer& init) + TableType* find_or_initialize(Initializer const& init) { int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); @@ -85,7 +89,7 @@ template class thread_safe_per_context_cache : public per_context_cache { public: template - TableType* find_or_initialize(const Initializer& init) + TableType* find_or_initialize(Initializer const& init) { std::lock_guard guard(mutex); return per_context_cache::find_or_initialize(init); diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 4f4b71ac82d..2fed36862b9 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,6 +87,33 @@ std::unique_ptr rfind( size_type stop = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a column of character position values where the target + * string is first found in the corresponding string of the provided column + * + * The output of row `i` is the character position of the target string for row `i` + * within input string of row `i` starting at the character position `start`. + * If the target is not found within the input string, -1 is returned for that + * row entry in the output column. + * + * Any null input or target entries return corresponding null output column entries. 
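For the column-target overload of `find` declared just below, a minimal sketch of the per-row position semantics described here; the row contents in the comments are hypothetical.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Suppose input  = ["hello world", "goodbye", "abc"]
//         target = ["world",       "xyz",     "c"  ]
// The documented behavior yields positions [6, -1, 2].
std::unique_ptr<cudf::column> find_targets(cudf::strings_column_view const& input,
                                           cudf::strings_column_view const& target)
{
  return cudf::strings::find(input, target, /*start=*/0);
}
```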
+ * + * @throw cudf::logic_error if `input.size() != target.size()` + * + * @param input Strings to search against + * @param target Strings to search for in `input` + * @param start First character position to include in the search + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values + */ +std::unique_ptr find( + strings_column_view const& input, + strings_column_view const& target, + size_type start = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a column of boolean values for each string where true indicates * the target string was found within that string in the provided column. diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 26fe5f95983..2b6575f80d0 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -20,8 +20,6 @@ #include -#include - namespace cudf { namespace strings { /** @@ -49,9 +47,8 @@ namespace strings { * out is '123XYZ-123XYZ-123XYZ-' * @endcode * - * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that - * can be stored by the index type: - * `input.size() * repeat_times > max of size_type` + * @throw std::overflow_error if the size of the output string scalar exceeds the maximum value that + * can be stored by the scalar: `input.size() * repeat_times > max of size_type` * * @param input The scalar containing the string to repeat * @param repeat_times The number of times the input string is repeated diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index e28d42b8154..5f2c71725eb 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -107,99 +107,6 @@ std::unique_ptr slice_strings( column_view const& stops, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Slices a column of strings by using a delimiter as a slice point. - * - * Returns a column of strings after searching for @p delimiter @p count number of - * times in the source @p strings from left to right if @p count is positive or from - * right to left if @p count is negative. If @p count is positive, it returns a substring - * from the start of the source @p strings up until @p count occurrence of the @p delimiter - * not including the @p delimiter. If @p count is negative, it returns a substring from - * the start of the @p count occurrence of the @p delimiter in the source @p strings past - * the delimiter until the end of the string. - * - * The search for @p delimiter in @p strings is case sensitive. - * If the row value of @p strings is null, the row value in the output column will be null. - * If the @p count is 0 or if @p delimiter is invalid or empty, every row in the output column - * will be an empty string. - * If the column value for a row is empty, the row value in the output column will be empty. - * If @p count occurrences of @p delimiter isn't found, the row value in the output column will - * be the row value from the input @p strings column. 
- * - * @code{.pseudo} - * Example: - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] - * r = slice_strings(in_s, '.', 1) - * r = ['www', null, 'www', '', 'foo'] - * - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] - * r = slice_strings(in_s, '.', -2) - * r = ['nvidia.com', null, 'google.com', '', 'foo'] - * @endcode - * - * @param strings Strings instance for this operation. - * @param delimiter UTF-8 encoded string to search for in each string. - * @param count Number of times to search for delimiter in each string. If the value is positive, - * delimiter is searched from left to right; else, it is searched from right to left. - * @param mr Resource for allocating device memory. - * @return New strings column containing the substrings. - */ -std::unique_ptr slice_strings( - strings_column_view const& strings, - string_scalar const& delimiter, - size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Slices a column of strings by using a delimiter column as slice points. - * - * Returns a column of strings after searching the delimiter defined per row from - * @p delimiter_strings @p count number of times in the source @p strings from left to right - * if @p count is positive or from right to left if @p count is negative. If @p count is - * positive, it returns a substring from the start of the source @p strings up until - * @p count occurrence of the delimiter for that row not including that delimiter. If @p count - * is negative, it returns a substring from the start of the @p count occurrence of the - * delimiter for that row in the source @p strings past the delimiter until the end of the string. - * - * The search for @p delimiter_strings in @p strings is case sensitive. - * If the @p count is 0, every row in the output column will be an empty string. - * If the row value of @p strings is null, the row value in the output column will be null. - * If the row value from @p delimiter_strings is invalid or null, the row value in the - * output column will be an empty string. - * If the row value from @p delimiter_strings or the column value for a row is empty, the - * row value in the output column will be empty. - * If @p count occurrences of delimiter isn't found, the row value in the output column will - * be the row value from the input @p strings column. - * - * @code{.pseudo} - * Example: - * in_s = ['www.nvidia.com', null, 'www.google.com', 'bar', 'foo..bar....goo'] - * delimiters = ['.', '..', '', null, '..'] - * r = slice_strings(in_s, delimiters, 2) - * r = ['www.nvidia', null, '', '', 'foo..bar'] - * - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org'] - * delimiters = ['.', '..', '', null,'..', '.'] - * r = slice_strings(in_s, delimiters, -2) - * r = ['nvidia.com', null, '', '', '..goo', 'apache.org'] - * @endcode - * - * @throw cudf::logic_error if the number of rows in @p strings and @p delimiter_strings do not - * match. - * - * @param strings Strings instance for this operation. - * @param delimiter_strings UTF-8 encoded string for each row. - * @param count Number of times to search for delimiter in each string. If the value is positive, - * delimiter is searched from left to right; else, it is searched from right to left. - * @param mr Resource for allocating device memory. - * @return New strings column containing the substrings. 
- */ -std::unique_ptr slice_strings( - strings_column_view const& strings, - strings_column_view const& delimiter_strings, - size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index a6c942d39b4..701950e61a5 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,20 @@ namespace strings { * * Any null string entries return corresponding null output columns. * - * @param strings_column Strings instance for this operation. - * @param delimiter UTF-8 encoded string indicating the split points in each string. + * @param strings_column Strings instance for this operation + * @param delimiter UTF-8 encoded string indicating the split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. + * @param maxsplit Maximum number of splits to perform; * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr
split( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,18 +73,20 @@ std::unique_ptr
split( * * Any null string entries return corresponding null output columns. * - * @param strings_column Strings instance for this operation. - * @param delimiter UTF-8 encoded string indicating the split points in each string. + * @param strings_column Strings instance for this operation + * @param delimiter UTF-8 encoded string indicating the split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. + * @param maxsplit Maximum number of splits to perform; * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned table's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * @return New strings columns. */ std::unique_ptr
rsplit( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,20 +145,22 @@ std::unique_ptr
rsplit( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be split. - * @param delimiter The string to identify split points in each string. + * @param strings A column of string elements to be split + * @param delimiter The string to identify split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. - * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row + * @param maxsplit Maximum number of splits to perform; + * Default of -1 indicates all possible splits on each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings; + * Each row of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,20 +224,22 @@ std::unique_ptr split_record( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be split. - * @param delimiter The string to identify split points in each string. + * @param strings A column of string elements to be split + * @param delimiter The string to identify split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. - * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row + * @param maxsplit Maximum number of splits to perform; + * Default of -1 indicates all possible splits on each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings; + * Each row of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr rsplit_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index fc4e3d57cfb..74df1ea1887 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -46,7 +46,7 @@ namespace detail { * @param bytes Number of bytes in str. * @return The number of characters in the array. 
*/ -__device__ inline size_type characters_in_string(const char* str, size_type bytes) +__device__ inline size_type characters_in_string(char const* str, size_type bytes) { if ((str == nullptr) || (bytes == 0)) return 0; auto ptr = reinterpret_cast(str); @@ -123,7 +123,7 @@ CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); } */ CUDF_HOST_DEVICE inline string_view string_view::max() { - const char* psentinel{nullptr}; + char const* psentinel{nullptr}; #if defined(__CUDA_ARCH__) psentinel = &cudf::strings::detail::max_string_sentinel[0]; #else @@ -142,7 +142,7 @@ __device__ inline size_type string_view::length() const // @cond // this custom iterator knows about UTF8 encoding -__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos) +__device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos) : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)} { } @@ -223,38 +223,45 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera return tmp; } +__device__ inline string_view::const_iterator& string_view::const_iterator::move_to( + size_type new_pos) +{ + *this += (new_pos - char_pos); // more efficient than recounting from the start + return *this; +} + __device__ inline bool string_view::const_iterator::operator==( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos == rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator!=( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p != rhs.p) || (char_pos != rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator<( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos < rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator<=( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos <= rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator>( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos > rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator>=( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos >= rhs.char_pos); } @@ -272,7 +279,7 @@ __device__ inline size_type string_view::const_iterator::byte_offset() const { r __device__ inline string_view::const_iterator string_view::begin() const { - return const_iterator(*this, 0); + return const_iterator(*this, 0, 0); } __device__ inline string_view::const_iterator string_view::end() const @@ -296,16 +303,16 @@ __device__ inline size_type string_view::byte_offset(size_type pos) const return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } -__device__ inline int string_view::compare(const string_view& in) const +__device__ inline int string_view::compare(string_view const& in) const { return compare(in.data(), in.size_bytes()); } -__device__ inline int string_view::compare(const char* data, size_type bytes) const +__device__ inline int string_view::compare(char const* data, size_type bytes) const { size_type const len1 = size_bytes(); - const auto* ptr1 = 
reinterpret_cast(this->data()); - const auto* ptr2 = reinterpret_cast(data); + auto const* ptr1 = reinterpret_cast(this->data()); + auto const* ptr2 = reinterpret_cast(data); if ((ptr1 == ptr2) && (bytes == len1)) return 0; size_type idx = 0; for (; (idx < len1) && (idx < bytes); ++idx) { @@ -318,39 +325,39 @@ __device__ inline int string_view::compare(const char* data, size_type bytes) co return 0; } -__device__ inline bool string_view::operator==(const string_view& rhs) const +__device__ inline bool string_view::operator==(string_view const& rhs) const { return (size_bytes() == rhs.size_bytes()) && (compare(rhs) == 0); } -__device__ inline bool string_view::operator!=(const string_view& rhs) const +__device__ inline bool string_view::operator!=(string_view const& rhs) const { return compare(rhs) != 0; } -__device__ inline bool string_view::operator<(const string_view& rhs) const +__device__ inline bool string_view::operator<(string_view const& rhs) const { return compare(rhs) < 0; } -__device__ inline bool string_view::operator>(const string_view& rhs) const +__device__ inline bool string_view::operator>(string_view const& rhs) const { return compare(rhs) > 0; } -__device__ inline bool string_view::operator<=(const string_view& rhs) const +__device__ inline bool string_view::operator<=(string_view const& rhs) const { int rc = compare(rhs); return (rc == 0) || (rc < 0); } -__device__ inline bool string_view::operator>=(const string_view& rhs) const +__device__ inline bool string_view::operator>=(string_view const& rhs) const { int rc = compare(rhs); return (rc == 0) || (rc > 0); } -__device__ inline size_type string_view::find(const string_view& str, +__device__ inline size_type string_view::find(string_view const& str, size_type pos, size_type count) const { @@ -358,7 +365,7 @@ __device__ inline size_type string_view::find(const string_view& str, } template -__device__ inline size_type string_view::find_impl(const char* str, +__device__ inline size_type string_view::find_impl(char const* str, size_type bytes, size_type pos, size_type count) const @@ -388,7 +395,7 @@ __device__ inline size_type string_view::find_impl(const char* str, return npos; } -__device__ inline size_type string_view::find(const char* str, +__device__ inline size_type string_view::find(char const* str, size_type bytes, size_type pos, size_type count) const @@ -403,14 +410,14 @@ __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size return find(str, chwidth, pos, count); } -__device__ inline size_type string_view::rfind(const string_view& str, +__device__ inline size_type string_view::rfind(string_view const& str, size_type pos, size_type count) const { return rfind(str.data(), str.size_bytes(), pos, count); } -__device__ inline size_type string_view::rfind(const char* str, +__device__ inline size_type string_view::rfind(char const* str, size_type bytes, size_type pos, size_type count) const diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 23627943d95..afc7e027a4b 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -60,7 +60,7 @@ class string_view { * * @return A pointer to the internal device array */ - CUDF_HOST_DEVICE [[nodiscard]] inline const char* data() const { return _data; } + CUDF_HOST_DEVICE [[nodiscard]] inline char const* data() const { return _data; } /** * @brief Return true if string has no characters @@ -80,10 +80,10 @@ class string_view { using reference = char_utf8&; 
using pointer = char_utf8*; using iterator_category = std::input_iterator_tag; - __device__ inline const_iterator(const string_view& str, size_type pos); - const_iterator(const const_iterator& mit) = default; + __device__ inline const_iterator(string_view const& str, size_type pos); + const_iterator(const_iterator const& mit) = default; const_iterator(const_iterator&& mit) = default; - const_iterator& operator=(const const_iterator&) = default; + const_iterator& operator=(const_iterator const&) = default; const_iterator& operator=(const_iterator&&) = default; __device__ inline const_iterator& operator++(); __device__ inline const_iterator operator++(int); @@ -93,19 +93,20 @@ class string_view { __device__ inline const_iterator operator--(int); __device__ inline const_iterator& operator-=(difference_type); __device__ inline const_iterator operator-(difference_type) const; - __device__ inline bool operator==(const const_iterator&) const; - __device__ inline bool operator!=(const const_iterator&) const; - __device__ inline bool operator<(const const_iterator&) const; - __device__ inline bool operator<=(const const_iterator&) const; - __device__ inline bool operator>(const const_iterator&) const; - __device__ inline bool operator>=(const const_iterator&) const; + __device__ inline const_iterator& move_to(size_type); + __device__ inline bool operator==(const_iterator const&) const; + __device__ inline bool operator!=(const_iterator const&) const; + __device__ inline bool operator<(const_iterator const&) const; + __device__ inline bool operator<=(const_iterator const&) const; + __device__ inline bool operator>(const_iterator const&) const; + __device__ inline bool operator>=(const_iterator const&) const; __device__ inline char_utf8 operator*() const; [[nodiscard]] __device__ inline size_type position() const; [[nodiscard]] __device__ inline size_type byte_offset() const; private: friend class string_view; - const char* p{}; + char const* p{}; size_type bytes{}; size_type char_pos{}; size_type byte_pos{}; @@ -154,7 +155,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ [[nodiscard]] inline int compare(const string_view& str) const; + __device__ [[nodiscard]] inline int compare(string_view const& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -169,7 +170,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ inline int compare(const char* str, size_type bytes) const; + __device__ inline int compare(char const* str, size_type bytes) const; /** * @brief Returns true if rhs matches this string exactly. @@ -177,42 +178,42 @@ class string_view { * @param rhs Target string to compare with this string. * @return true if rhs matches this string exactly */ - __device__ inline bool operator==(const string_view& rhs) const; + __device__ inline bool operator==(string_view const& rhs) const; /** * @brief Returns true if rhs does not match this string. * * @param rhs Target string to compare with this string. * @return true if rhs does not match this string */ - __device__ inline bool operator!=(const string_view& rhs) const; + __device__ inline bool operator!=(string_view const& rhs) const; /** * @brief Returns true if this string is ordered before rhs. * * @param rhs Target string to compare with this string. 
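A short device-side sketch of the const_iterator::move_to() method added above: it repositions an existing iterator to an absolute character position, reusing the characters already counted rather than walking the UTF-8 data from the start. The helper `code_point_at` is hypothetical and assumes 0 <= pos < d_str.length().

#include <cudf/strings/string_view.cuh>

// Sketch: decode the UTF-8 code point at character position `pos` without
// re-walking the string from its first byte each time.
__device__ cudf::char_utf8 code_point_at(cudf::string_view d_str, cudf::size_type pos)
{
  auto itr = d_str.begin();  // iterator at character position 0
  itr.move_to(pos);          // repositions relative to where the iterator already is
  return *itr;               // dereference yields the code point at `pos`
}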
* @return true if this string is ordered before rhs */ - __device__ inline bool operator<(const string_view& rhs) const; + __device__ inline bool operator<(string_view const& rhs) const; /** * @brief Returns true if rhs is ordered before this string. * * @param rhs Target string to compare with this string. * @return true if rhs is ordered before this string */ - __device__ inline bool operator>(const string_view& rhs) const; + __device__ inline bool operator>(string_view const& rhs) const; /** * @brief Returns true if this string matches or is ordered before rhs. * * @param rhs Target string to compare with this string. * @return true if this string matches or is ordered before rhs */ - __device__ inline bool operator<=(const string_view& rhs) const; + __device__ inline bool operator<=(string_view const& rhs) const; /** * @brief Returns true if rhs matches or is ordered before this string. * * @param rhs Target string to compare with this string. * @return true if rhs matches or is ordered before this string */ - __device__ inline bool operator>=(const string_view& rhs) const; + __device__ inline bool operator>=(string_view const& rhs) const; /** * @brief Returns the character position of the first occurrence where the @@ -224,7 +225,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if str is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(const string_view& str, + __device__ [[nodiscard]] inline size_type find(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -238,7 +239,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ inline size_type find(const char* str, + __device__ inline size_type find(char const* str, size_type bytes, size_type pos = 0, size_type count = -1) const; @@ -265,7 +266,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(const string_view& str, + __device__ [[nodiscard]] inline size_type rfind(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -279,7 +280,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ inline size_type rfind(const char* str, + __device__ inline size_type rfind(char const* str, size_type bytes, size_type pos = 0, size_type count = -1) const; @@ -339,12 +340,12 @@ class string_view { * @param data Device char array encoded in UTF8. * @param bytes Number of bytes in data array. 
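To make the character-position semantics of find() and its `pos` argument concrete, a small device-side sketch that combines two forward searches with string_view::substr(); substr() is not shown in this hunk and is assumed available, and the helper name is hypothetical.

#include <cudf/strings/string_view.cuh>

// Sketch: the text strictly between the first two occurrences of `delim`,
// or an empty string_view when there are fewer than two occurrences.
__device__ cudf::string_view between_first_two(cudf::string_view d_str,
                                               cudf::string_view delim)
{
  auto const first = d_str.find(delim);          // character position, -1 when absent
  if (first < 0) { return cudf::string_view{}; }
  auto const begin  = first + delim.length();
  auto const second = d_str.find(delim, begin);  // resume the search after the first hit
  if (second < 0) { return cudf::string_view{}; }
  return d_str.substr(begin, second - begin);    // positions and counts are in characters
}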
*/ - CUDF_HOST_DEVICE inline string_view(const char* data, size_type bytes) + CUDF_HOST_DEVICE inline string_view(char const* data, size_type bytes) : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH) { } - string_view(const string_view&) = default; ///< Copy constructor + string_view(string_view const&) = default; ///< Copy constructor string_view(string_view&&) = default; ///< Move constructor ~string_view() = default; /** @@ -352,7 +353,7 @@ class string_view { * * @return Reference to this instance */ - string_view& operator=(const string_view&) = default; + string_view& operator=(string_view const&) = default; /** * @brief Move assignment operator * @@ -368,7 +369,7 @@ class string_view { static inline cudf::size_type const npos{-1}; private: - const char* _data{}; ///< Pointer to device memory contain char array for this string + char const* _data{}; ///< Pointer to device memory contain char array for this string size_type _bytes{}; ///< Number of bytes in _data for this string mutable size_type _length{}; ///< Number of characters in this string (computed) @@ -399,7 +400,7 @@ class string_view { * @return npos if str is not found in this string */ template - __device__ inline size_type find_impl(const char* str, + __device__ inline size_type find_impl(char const* str, size_type bytes, size_type pos, size_type count) const; diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e617dbde024..f1aa8e49f00 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,8 +67,8 @@ class strings_column_view : private column_view { using column_view::offset; using column_view::size; - using offset_iterator = offset_type const*; ///< offsets iterator type - using chars_iterator = char const*; ///< character iterator type + using offset_iterator = size_type const*; ///< offsets iterator type + using chars_iterator = char const*; ///< character iterator type /** * @brief Returns the parent column. diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3e37bd53972..6b024d902a9 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -17,11 +17,11 @@ #pragma once #include -#include #include #include #include -#include +#include +#include #include #include #include @@ -542,8 +542,10 @@ class device_row_comparator { size_type const rhs_index) const noexcept { int last_null_depth = std::numeric_limits::max(); - size_type list_column_index{0}; + size_type list_column_index{-1}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { + if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; } + int const depth = _depth.has_value() ? (*_depth)[i] : 0; if (depth > last_null_depth) { continue; } @@ -556,15 +558,12 @@ class device_row_comparator { // TODO: At what point do we verify that the columns of lhs and rhs are // all of the same types? I assume that it's already happened before // here, otherwise the current code would be failing. 
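The comparator change above starts list_column_index at -1 and advances it at the top of the loop for every LIST column, because dremel data is stored only for LIST columns. A purely illustrative host-side sketch of that indexing scheme (not libcudf code):

#include <cstddef>
#include <vector>

// Sketch: a counter that starts at -1 and is advanced whenever a LIST column is
// visited names that column's slot in the dremel array; non-LIST columns get no slot.
std::vector<int> dremel_slot_per_column(std::vector<bool> const& is_list_column)
{
  std::vector<int> slots(is_list_column.size(), -1);
  int list_column_index = -1;
  for (std::size_t i = 0; i < is_list_column.size(); ++i) {
    if (is_list_column[i]) { ++list_column_index; }
    slots[i] = is_list_column[i] ? list_column_index : -1;
  }
  return slots;
}
// e.g. {false, true, false, true}  ->  {-1, 0, -1, 1}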
- auto [l_dremel_i, r_dremel_i] = [&]() { - if (_lhs.column(i).type().id() == type_id::LIST) { - auto idx = list_column_index++; - return std::make_tuple(optional_dremel_view(_l_dremel[idx]), - optional_dremel_view(_r_dremel[idx])); - } else { - return std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); - } - }(); + auto const [l_dremel_i, r_dremel_i] = + _lhs.column(i).type().id() == type_id::LIST + ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), + optional_dremel_view(_r_dremel[list_column_index])) + : std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + auto element_comp = element_comparator{_check_nulls, _lhs.column(i), _rhs.column(i), @@ -1813,7 +1812,7 @@ class device_row_hasher { // Hash each element and combine all the hash values together return detail::accumulate(it, it + _table.num_columns(), _seed, [](auto hash, auto h) { - return cudf::detail::hash_combine(hash, h); + return cudf::hashing::detail::hash_combine(hash, h); }); } @@ -1854,7 +1853,8 @@ class device_row_hasher { auto validity_it = detail::make_validity_iterator(curr_col); hash = detail::accumulate( validity_it, validity_it + curr_col.size(), hash, [](auto hash, auto is_valid) { - return cudf::detail::hash_combine(hash, is_valid ? NON_NULL_HASH : NULL_HASH); + return cudf::hashing::detail::hash_combine(hash, + is_valid ? NON_NULL_HASH : NULL_HASH); }); } if (curr_col.type().id() == type_id::STRUCT) { @@ -1866,13 +1866,13 @@ class device_row_hasher { auto list_sizes = make_list_size_iterator(list_col); hash = detail::accumulate( list_sizes, list_sizes + list_col.size(), hash, [](auto hash, auto size) { - return cudf::detail::hash_combine(hash, hash_fn{}(size)); + return cudf::hashing::detail::hash_combine(hash, hash_fn{}(size)); }); curr_col = list_col.get_sliced_child(); } } for (int i = 0; i < curr_col.size(); ++i) { - hash = cudf::detail::hash_combine( + hash = cudf::hashing::detail::hash_combine( hash, type_dispatcher(curr_col.type(), _element_hasher, curr_col, i)); } @@ -1941,7 +1941,7 @@ class row_hasher { * @param seed The seed to use for the hash function * @return A hash operator to use on the device */ - template