diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 9ceb4fea64a..a8c4e481683 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -10,6 +10,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  # Please keep pr-builder as the top job here
   pr-builder:
     needs:
       - changed-files
@@ -37,13 +38,23 @@ jobs:
       - unit-tests-cudf-pandas
       - pandas-tests
       - pandas-tests-diff
+      - telemetry-setup
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
+  telemetry-setup:
+    continue-on-error: true
+    runs-on: ubuntu-latest
+    env:
+      OTEL_SERVICE_NAME: 'pr-cudf'
+    steps:
+      - name: Telemetry setup
+        uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
   changed-files:
     secrets: inherit
+    needs: telemetry-setup
     uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02
     with:
       files_yaml: |
@@ -91,9 +102,11 @@ jobs:
           - '!notebooks/**'
   checks:
     secrets: inherit
+    needs: telemetry-setup
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02
     with:
       enable_check_generated_files: false
+      ignored_pr_jobs: "telemetry-summarize"
   conda-cpp-build:
     needs: checks
     secrets: inherit
@@ -259,6 +272,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
+    needs: telemetry-setup
     uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
     with:
       arch: '["amd64"]'
@@ -298,3 +312,18 @@ jobs:
         node_type: cpu4
         build_type: pull-request
         run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
+
+  telemetry-summarize:
+    runs-on: ubuntu-latest
+    needs: pr-builder
+    if: always()
+    continue-on-error: true
+    steps:
+      - name: Load stashed telemetry env vars
+        uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
+        with:
+            load_service_name: true
+      - name: Telemetry summarize
+        uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
+        with:
+          cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index bc931d73071..858352f515d 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -121,8 +121,6 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -153,8 +151,6 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -164,8 +160,6 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh
index 95f78f17f2f..b1bfac2a1dd 100755
--- a/ci/run_cudf_polars_polars_tests.sh
+++ b/ci/run_cudf_polars_polars_tests.sh
@@ -12,8 +12,29 @@ DESELECTED_TESTS=(
     "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode
     "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error
     "tests/docs/test_user_guide.py" # No dot binary in CI image
+    "tests/unit/test_polars_import.py::test_fork_safety" # test started to hang in polars-1.14
+    "tests/unit/operations/test_join.py::test_join_4_columns_with_validity" # fails in some systems, see https://github.com/pola-rs/polars/issues/19870
+    "tests/unit/io/test_csv.py::test_read_web_file" # fails in rockylinux8 due to SSL CA issues
 )
 
+if [[ $(arch) == "aarch64" ]]; then
+    # The binary used for TPC-H generation is compiled for x86_64, not aarch64.
+    DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh")
+    # The connectorx package is not available on arm
+    DESELECTED_TESTS+=("tests/unit/io/database/test_read.py::test_read_database")
+    # The necessary timezone information cannot be found in our CI image.
+    DESELECTED_TESTS+=("tests/unit/io/test_parquet.py::test_parametric_small_page_mask_filtering")
+    DESELECTED_TESTS+=("tests/unit/testing/test_assert_series_equal.py::test_assert_series_equal_parametric")
+    DESELECTED_TESTS+=("tests/unit/operations/test_join.py::test_join_4_columns_with_validity")
+else
+    # Ensure that we don't run dbgen when it uses newer symbols than supported by the glibc version in the CI image.
+    glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
+    latest_glibc_symbol_found=$(nm py-polars/tests/benchmark/data/pdsh/dbgen/dbgen | grep GLIBC | grep -o "[0-9]\.[0-9]\+" | sort --version-sort | tail -1 | cut -d "." -f 2)
+    if [[ ${glibc_minor_version} -lt ${latest_glibc_symbol_found} ]]; then
+        DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh")
+    fi
+fi
+
 DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}")
 python -m pytest \
        --import-mode=importlib \
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0d315f95983..ecc490b378b 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -66,7 +66,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.14
+- polars>=1.11,<1.15
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<19.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index ac849f28c14..4559829ac3a 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,7 +64,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.14
+- polars>=1.11,<1.15
 - pre-commit
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml
index 7a477291e7a..b6c03dc1bc2 100644
--- a/conda/recipes/cudf-polars/meta.yaml
+++ b/conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.11,<1.14
+    - polars >=1.11,<1.15
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6752ce12d83..506f6c185f5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -464,6 +464,7 @@ add_library(
   src/io/avro/avro_gpu.cu
   src/io/avro/reader_impl.cu
   src/io/comp/brotli_dict.cpp
+  src/io/comp/comp.cpp
   src/io/comp/cpu_unbz2.cpp
   src/io/comp/debrotli.cu
   src/io/comp/gpuinflate.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ae78b206810..7fdaff35525 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -354,10 +354,7 @@ ConfigureNVBench(
 
 # ##################################################################################################
 # * strings benchmark -------------------------------------------------------------------
-ConfigureBench(
-  STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/replace.cpp string/translate.cpp
-  string/url_decode.cu
-)
+ConfigureBench(STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/url_decode.cu)
 
 ConfigureNVBench(
   STRINGS_NVBENCH
@@ -381,11 +378,13 @@ ConfigureNVBench(
   string/lengths.cpp
   string/like.cpp
   string/make_strings_column.cu
+  string/replace.cpp
   string/replace_re.cpp
   string/reverse.cpp
   string/slice.cpp
   string/split.cpp
   string/split_re.cpp
+  string/translate.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp
index 4366790f208..678f2f1a600 100644
--- a/cpp/benchmarks/io/json/json_reader_input.cpp
+++ b/cpp/benchmarks/io/json/json_reader_input.cpp
@@ -24,17 +24,19 @@
 
 #include <nvbench/nvbench.cuh>
 
-// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
-// run on most GPUs, but large enough to allow highest throughput
-constexpr size_t data_size         = 512 << 20;
+// Default size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks
+// to run on most GPUs, but large enough to allow highest throughput
+constexpr size_t default_data_size = 512 << 20;
 constexpr cudf::size_type num_cols = 64;
 
 void json_read_common(cuio_source_sink_pair& source_sink,
                       cudf::size_type num_rows_to_read,
-                      nvbench::state& state)
+                      nvbench::state& state,
+                      cudf::io::compression_type comptype = cudf::io::compression_type::NONE,
+                      size_t data_size                    = default_data_size)
 {
   cudf::io::json_reader_options read_opts =
-    cudf::io::json_reader_options::builder(source_sink.make_source_info());
+    cudf::io::json_reader_options::builder(source_sink.make_source_info()).compression(comptype);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -57,15 +59,21 @@ void json_read_common(cuio_source_sink_pair& source_sink,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
-cudf::size_type json_write_bm_data(cudf::io::sink_info sink,
-                                   std::vector<cudf::type_id> const& dtypes)
+cudf::size_type json_write_bm_data(
+  cudf::io::sink_info sink,
+  std::vector<cudf::type_id> const& dtypes,
+  cudf::io::compression_type comptype = cudf::io::compression_type::NONE,
+  size_t data_size                    = default_data_size)
 {
   auto const tbl = create_random_table(
     cycle_dtypes(dtypes, num_cols), table_size_bytes{data_size}, data_profile_builder());
   auto const view = tbl->view();
 
   cudf::io::json_writer_options const write_opts =
-    cudf::io::json_writer_options::builder(sink, view).na_rep("null").rows_per_chunk(100'000);
+    cudf::io::json_writer_options::builder(sink, view)
+      .na_rep("null")
+      .rows_per_chunk(100'000)
+      .compression(comptype);
   cudf::io::write_json(write_opts);
   return view.num_rows();
 }
@@ -87,6 +95,26 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_typ
   json_read_common(source_sink, num_rows, state);
 }
 
+template <cudf::io::compression_type comptype, io_type IO>
+void BM_json_read_compressed_io(
+  nvbench::state& state, nvbench::type_list<nvbench::enum_type<comptype>, nvbench::enum_type<IO>>)
+{
+  size_t const data_size = state.get_int64("data_size");
+  cuio_source_sink_pair source_sink(IO);
+  auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                         static_cast<int32_t>(data_type::FLOAT),
+                                         static_cast<int32_t>(data_type::DECIMAL),
+                                         static_cast<int32_t>(data_type::TIMESTAMP),
+                                         static_cast<int32_t>(data_type::DURATION),
+                                         static_cast<int32_t>(data_type::STRING),
+                                         static_cast<int32_t>(data_type::LIST),
+                                         static_cast<int32_t>(data_type::STRUCT)});
+  auto const num_rows =
+    json_write_bm_data(source_sink.make_sink_info(), d_type, comptype, data_size);
+
+  json_read_common(source_sink, num_rows, state, comptype, data_size);
+}
+
 template <data_type DataType, io_type IO>
 void BM_json_read_data_type(
   nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)
@@ -110,8 +138,9 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
 using io_list =
   nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
 
-using compression_list =
-  nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
+using compression_list = nvbench::enum_type_list<cudf::io::compression_type::GZIP,
+                                                 cudf::io::compression_type::SNAPPY,
+                                                 cudf::io::compression_type::NONE>;
 
 NVBENCH_BENCH_TYPES(BM_json_read_data_type,
                     NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
@@ -123,3 +152,10 @@ NVBENCH_BENCH_TYPES(BM_json_read_io, NVBENCH_TYPE_AXES(io_list))
   .set_name("json_read_io")
   .set_type_axes_names({"io"})
   .set_min_samples(4);
+
+NVBENCH_BENCH_TYPES(BM_json_read_compressed_io,
+                    NVBENCH_TYPE_AXES(compression_list, nvbench::enum_type_list<io_type::FILEPATH>))
+  .set_name("json_read_compressed_io")
+  .set_type_axes_names({"compression_type", "io"})
+  .add_int64_power_of_two_axis("data_size", nvbench::range(20, 29, 1))
+  .set_min_samples(4);
diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp
index cc548ccd3de..011b2590c6f 100644
--- a/cpp/benchmarks/io/nvbench_helpers.hpp
+++ b/cpp/benchmarks/io/nvbench_helpers.hpp
@@ -76,6 +76,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
   [](auto value) {
     switch (value) {
       case cudf::io::compression_type::SNAPPY: return "SNAPPY";
+      case cudf::io::compression_type::GZIP: return "GZIP";
       case cudf::io::compression_type::NONE: return "NONE";
       default: return "Unknown";
     }
diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp
index 3d9d51bfd6d..643e857f356 100644
--- a/cpp/benchmarks/string/replace.cpp
+++ b/cpp/benchmarks/string/replace.cpp
@@ -14,11 +14,8 @@
  * limitations under the License.
  */
 
-#include "string_bench_args.hpp"
-
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
@@ -27,59 +24,51 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <limits>
-
-class StringReplace : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
 enum replace_type { scalar, slice, multi };
 
-static void BM_replace(benchmark::State& state, replace_type rt)
+static void bench_replace(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
+  auto const api       = state.get_string("api");
+
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
-  cudf::strings_column_view input(column->view());
-  cudf::string_scalar target("+");
-  cudf::string_scalar repl("");
-  cudf::test::strings_column_wrapper targets({"+", "-"});
-  cudf::test::strings_column_wrapper repls({"", ""});
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    switch (rt) {
-      case scalar: cudf::strings::replace(input, target, repl); break;
-      case slice: cudf::strings::replace_slice(input, repl, 1, 10); break;
-      case multi:
-        cudf::strings::replace_multiple(
-          input, cudf::strings_column_view(targets), cudf::strings_column_view(repls));
-        break;
-    }
-  }
+  cudf::strings_column_view input(column->view());
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const chars_size = input.chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 2;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  if (api == "scalar") {
+    cudf::string_scalar target("+");
+    cudf::string_scalar repl("-");
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::replace(input, target, repl); });
+  } else if (api == "multi") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::test::strings_column_wrapper targets({"+", " "});
+      cudf::test::strings_column_wrapper repls({"-", "_"});
+      cudf::strings::replace_multiple(
+        input, cudf::strings_column_view(targets), cudf::strings_column_view(repls));
+    });
+  } else if (api == "slice") {
+    cudf::string_scalar repl("0123456789");
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::replace_slice(input, repl, 1, 10); });
+  }
 }
 
-#define STRINGS_BENCHMARK_DEFINE(name)                              \
-  BENCHMARK_DEFINE_F(StringReplace, name)                           \
-  (::benchmark::State & st) { BM_replace(st, replace_type::name); } \
-  BENCHMARK_REGISTER_F(StringReplace, name)                         \
-    ->Apply(generate_bench_args)                                    \
-    ->UseManualTime()                                               \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(scalar)
-STRINGS_BENCHMARK_DEFINE(slice)
-STRINGS_BENCHMARK_DEFINE(multi)
+NVBENCH_BENCH(bench_replace)
+  .set_name("replace")
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
+  .add_string_axis("api", {"scalar", "multi", "slice"});
diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp
index dc3c8c71488..020ab3ca965 100644
--- a/cpp/benchmarks/string/translate.cpp
+++ b/cpp/benchmarks/string/translate.cpp
@@ -14,13 +14,7 @@
  * limitations under the License.
  */
 
-#include "string_bench_args.hpp"
-
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
@@ -28,20 +22,24 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
-#include <algorithm>
+#include <nvbench/nvbench.cuh>
 
-class StringTranslate : public cudf::benchmark {};
+#include <algorithm>
+#include <vector>
 
 using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;
 
-static void BM_translate(benchmark::State& state, int entry_count)
+static void bench_translate(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width   = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width   = static_cast<cudf::size_type>(state.get_int64("max_width"));
+  auto const entry_count = static_cast<cudf::size_type>(state.get_int64("entries"));
+
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
-  cudf::strings_column_view input(column->view());
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  auto const input  = cudf::strings_column_view(column->view());
 
   std::vector<entry_type> entries(entry_count);
   std::transform(thrust::counting_iterator<int>(0),
@@ -51,33 +49,19 @@ static void BM_translate(benchmark::State& state, int entry_count)
                    return entry_type{'!' + idx, '~' - idx};
                  });
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::strings::translate(input, entries);
-  }
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = input.chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { cudf::strings::translate(input, entries); });
 }
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
-}
-
-#define STRINGS_BENCHMARK_DEFINE(name, entries)            \
-  BENCHMARK_DEFINE_F(StringTranslate, name)                \
-  (::benchmark::State & st) { BM_translate(st, entries); } \
-  BENCHMARK_REGISTER_F(StringTranslate, name)              \
-    ->Apply(generate_bench_args)                           \
-    ->UseManualTime()                                      \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(translate_small, 5)
-STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
-STRINGS_BENCHMARK_DEFINE(translate_large, 50)
+NVBENCH_BENCH(bench_translate)
+  .set_name("translate")
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
+  .add_int64_axis("entries", {5, 25, 50});
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7cd4697f592..0c3244a1c75 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -946,6 +946,8 @@ class json_writer_options_builder;
 class json_writer_options {
   // Specify the sink to use for writer output
   sink_info _sink;
+  // Specify the compression format of the sink
+  compression_type _compression = compression_type::NONE;
   // maximum number of rows to write in each chunk (limits memory use)
   size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
   // Set of columns to output
@@ -1022,6 +1024,13 @@ class json_writer_options {
    */
   [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; }
 
+  /**
+   * @brief Returns compression type used for sink
+   *
+   * @return compression type for sink
+   */
+  [[nodiscard]] compression_type get_compression() const { return _compression; }
+
   /**
    * @brief Whether to output nulls as 'null'.
    *
@@ -1066,6 +1075,13 @@ class json_writer_options {
    */
   void set_table(table_view tbl) { _table = tbl; }
 
+  /**
+   * @brief Sets compression type to be used
+   *
+   * @param comptype Compression type for sink
+   */
+  void set_compression(compression_type comptype) { _compression = comptype; }
+
   /**
    * @brief Sets metadata.
    *
@@ -1153,6 +1169,18 @@ class json_writer_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets compression type of output sink
+   *
+   * @param comptype Compression type used
+   * @return this for chaining
+   */
+  json_writer_options_builder& compression(compression_type comptype)
+  {
+    options._compression = comptype;
+    return *this;
+  }
+
   /**
    * @brief Sets optional metadata (with column names).
    *
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index c6af0c3c58a..06987139188 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -21,7 +21,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
-#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/unary.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -253,16 +253,11 @@ struct binary_op_double_device_dispatcher {
 template <typename Functor>
 CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
 {
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
-
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
+  auto start        = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
 #pragma unroll
-  for (cudf::size_type i = start; i < size; i += step) {
+  for (auto i = start; i < size; i += stride) {
     f(i);
   }
 }
@@ -282,9 +277,9 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
   int min_grid_size;
   CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
-  // 2 elements per thread.
-  int const grid_size = util::div_rounding_up_safe(size, 2 * block_size);
-  for_each_kernel<<<grid_size, block_size, 0, stream.value()>>>(size, std::forward<Functor&&>(f));
+  auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */);
+  for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    size, std::forward<Functor&&>(f));
 }
 
 template <class BinaryOperator>
diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp
new file mode 100644
index 00000000000..b26a6292806
--- /dev/null
+++ b/cpp/src/io/comp/comp.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "comp.hpp"
+
+#include "io/utilities/hostdevice_vector.hpp"
+#include "nvcomp_adapter.hpp"
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <zlib.h>  // GZIP compression
+
+namespace cudf::io::detail {
+
+namespace {
+
+/**
+ * @brief GZIP host compressor (includes header)
+ */
+std::vector<std::uint8_t> compress_gzip(host_span<uint8_t const> src)
+{
+  z_stream zs;
+  zs.zalloc   = Z_NULL;
+  zs.zfree    = Z_NULL;
+  zs.opaque   = Z_NULL;
+  zs.avail_in = src.size();
+  zs.next_in  = reinterpret_cast<unsigned char*>(const_cast<unsigned char*>(src.data()));
+
+  std::vector<uint8_t> dst;
+  zs.avail_out = 0;
+  zs.next_out  = nullptr;
+
+  int windowbits    = 15;
+  int gzip_encoding = 16;
+  int ret           = deflateInit2(
+    &zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowbits | gzip_encoding, 8, Z_DEFAULT_STRATEGY);
+  CUDF_EXPECTS(ret == Z_OK, "GZIP DEFLATE compression initialization failed.");
+
+  uint32_t estcomplen = deflateBound(&zs, src.size());
+  dst.resize(estcomplen);
+  zs.avail_out = estcomplen;
+  zs.next_out  = dst.data();
+
+  ret = deflate(&zs, Z_FINISH);
+  CUDF_EXPECTS(ret == Z_STREAM_END, "GZIP DEFLATE compression failed due to insufficient space!");
+  dst.resize(std::distance(dst.data(), zs.next_out));
+
+  ret = deflateEnd(&zs);
+  CUDF_EXPECTS(ret == Z_OK, "GZIP DEFLATE compression failed at deallocation");
+
+  return dst;
+}
+
+/**
+ * @brief SNAPPY device compressor
+ */
+std::vector<std::uint8_t> compress_snappy(host_span<uint8_t const> src,
+                                          rmm::cuda_stream_view stream)
+{
+  auto const d_src =
+    cudf::detail::make_device_uvector_async(src, stream, cudf::get_current_device_resource_ref());
+  cudf::detail::hostdevice_vector<device_span<uint8_t const>> inputs(1, stream);
+  inputs[0] = d_src;
+  inputs.host_to_device_async(stream);
+
+  auto dst_size = compress_max_output_chunk_size(nvcomp::compression_type::SNAPPY, src.size());
+  rmm::device_uvector<uint8_t> d_dst(dst_size, stream);
+  cudf::detail::hostdevice_vector<device_span<uint8_t>> outputs(1, stream);
+  outputs[0] = d_dst;
+  outputs.host_to_device_async(stream);
+
+  cudf::detail::hostdevice_vector<cudf::io::compression_result> hd_status(1, stream);
+  hd_status[0] = {};
+  hd_status.host_to_device_async(stream);
+
+  nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, inputs, outputs, hd_status, stream);
+
+  hd_status.device_to_host_sync(stream);
+  CUDF_EXPECTS(hd_status[0].status == cudf::io::compression_status::SUCCESS,
+               "snappy compression failed");
+  return cudf::detail::make_std_vector_sync<uint8_t>(d_dst, stream);
+}
+
+}  // namespace
+
+std::vector<std::uint8_t> compress(compression_type compression,
+                                   host_span<uint8_t const> src,
+                                   rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  switch (compression) {
+    case compression_type::GZIP: return compress_gzip(src);
+    case compression_type::SNAPPY: return compress_snappy(src, stream);
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+}
+
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp
new file mode 100644
index 00000000000..652abbbeda6
--- /dev/null
+++ b/cpp/src/io/comp/comp.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/io/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace CUDF_EXPORT cudf {
+namespace io::detail {
+
+/**
+ * @brief Compresses a system memory buffer.
+ *
+ * @param compression Type of compression of the input data
+ * @param src         Decompressed host buffer
+ * @param stream      CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Vector containing the Compressed output
+ */
+std::vector<uint8_t> compress(compression_type compression,
+                              host_span<uint8_t const> src,
+                              rmm::cuda_stream_view stream);
+
+}  // namespace io::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index fff1cf0c96a..090ea1430b5 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -980,27 +980,27 @@ __device__ int parse_gzip_header(uint8_t const* src, size_t src_size)
     {
       uint8_t flags = src[3];
       hdr_len       = 10;
-      if (flags & GZIPHeaderFlag::fextra)  // Extra fields present
+      if (flags & detail::GZIPHeaderFlag::fextra)  // Extra fields present
       {
         int xlen = src[hdr_len] | (src[hdr_len + 1] << 8);
         hdr_len += xlen;
         if (hdr_len >= src_size) return -1;
       }
-      if (flags & GZIPHeaderFlag::fname)  // Original file name present
+      if (flags & detail::GZIPHeaderFlag::fname)  // Original file name present
       {
         // Skip zero-terminated string
         do {
           if (hdr_len >= src_size) return -1;
         } while (src[hdr_len++] != 0);
       }
-      if (flags & GZIPHeaderFlag::fcomment)  // Comment present
+      if (flags & detail::GZIPHeaderFlag::fcomment)  // Comment present
       {
         // Skip zero-terminated string
         do {
           if (hdr_len >= src_size) return -1;
         } while (src[hdr_len++] != 0);
       }
-      if (flags & GZIPHeaderFlag::fhcrc)  // Header CRC present
+      if (flags & detail::GZIPHeaderFlag::fhcrc)  // Header CRC present
       {
         hdr_len += 2;
       }
diff --git a/cpp/src/io/comp/io_uncomp.hpp b/cpp/src/io/comp/io_uncomp.hpp
index 1c9578fa5c0..ca722a9b7ee 100644
--- a/cpp/src/io/comp/io_uncomp.hpp
+++ b/cpp/src/io/comp/io_uncomp.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,8 +25,8 @@
 
 using cudf::host_span;
 
-namespace cudf {
-namespace io {
+namespace CUDF_EXPORT cudf {
+namespace io::detail {
 
 /**
  * @brief Decompresses a system memory buffer.
@@ -36,13 +36,35 @@ namespace io {
  *
  * @return Vector containing the Decompressed output
  */
-std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t const> src);
+[[nodiscard]] std::vector<uint8_t> decompress(compression_type compression,
+                                              host_span<uint8_t const> src);
 
+/**
+ * @brief Decompresses a system memory buffer.
+ *
+ * @param compression Type of compression of the input data
+ * @param src         Compressed host buffer
+ * @param dst         Destination host span to place decompressed buffer
+ * @param stream      CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Size of decompressed output
+ */
 size_t decompress(compression_type compression,
                   host_span<uint8_t const> src,
                   host_span<uint8_t> dst,
                   rmm::cuda_stream_view stream);
 
+/**
+ * @brief Without actually decompressing the compressed input buffer passed, return the size of
+ * decompressed output. If the decompressed size cannot be extracted apriori, return zero.
+ *
+ * @param compression Type of compression of the input data
+ * @param src         Compressed host buffer
+ *
+ * @return Size of decompressed output
+ */
+size_t get_uncompressed_size(compression_type compression, host_span<uint8_t const> src);
+
 /**
  * @brief GZIP header flags
  * See https://tools.ietf.org/html/rfc1952
@@ -55,5 +77,5 @@ constexpr uint8_t fname    = 0x08;  // Original file name present
 constexpr uint8_t fcomment = 0x10;  // Comment present
 };                                  // namespace GZIPHeaderFlag
 
-}  // namespace io
-}  // namespace cudf
+}  // namespace io::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index fb8c308065d..b3d43fa786a 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -28,13 +28,12 @@
 
 #include <cstring>  // memset
 
-using cudf::host_span;
-
-namespace cudf {
-namespace io {
+namespace cudf::io::detail {
 
 #pragma pack(push, 1)
 
+namespace {
+
 struct gz_file_header_s {
   uint8_t id1;        // 0x1f
   uint8_t id2;        // 0x8b
@@ -261,7 +260,7 @@ void cpu_inflate_vector(std::vector<uint8_t>& dst, uint8_t const* comp_data, siz
   strm.avail_out = dst.size();
   strm.total_out = 0;
   auto zerr      = inflateInit2(&strm, -15);  // -15 for raw data without GZIP headers
-  CUDF_EXPECTS(zerr == 0, "Error in DEFLATE stream");
+  CUDF_EXPECTS(zerr == 0, "Error in DEFLATE stream: inflateInit2 failed");
   do {
     if (strm.avail_out == 0) {
       dst.resize(strm.total_out + (1 << 30));
@@ -273,125 +272,7 @@ void cpu_inflate_vector(std::vector<uint8_t>& dst, uint8_t const* comp_data, siz
            strm.total_out == dst.size());
   dst.resize(strm.total_out);
   inflateEnd(&strm);
-  CUDF_EXPECTS(zerr == Z_STREAM_END, "Error in DEFLATE stream");
-}
-
-std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t const> src)
-{
-  CUDF_EXPECTS(src.data() != nullptr, "Decompression: Source cannot be nullptr");
-  CUDF_EXPECTS(not src.empty(), "Decompression: Source size cannot be 0");
-
-  auto raw                 = src.data();
-  uint8_t const* comp_data = nullptr;
-  size_t comp_len          = 0;
-  size_t uncomp_len        = 0;
-
-  switch (compression) {
-    case compression_type::AUTO:
-    case compression_type::GZIP: {
-      gz_archive_s gz;
-      if (ParseGZArchive(&gz, raw, src.size())) {
-        compression = compression_type::GZIP;
-        comp_data   = gz.comp_data;
-        comp_len    = gz.comp_len;
-        uncomp_len  = gz.isize;
-      }
-      if (compression != compression_type::AUTO) break;
-      [[fallthrough]];
-    }
-    case compression_type::ZIP: {
-      zip_archive_s za;
-      if (OpenZipArchive(&za, raw, src.size())) {
-        size_t cdfh_ofs = 0;
-        for (int i = 0; i < za.eocd->num_entries; i++) {
-          auto const* cdfh = reinterpret_cast<zip_cdfh_s const*>(
-            reinterpret_cast<uint8_t const*>(za.cdfh) + cdfh_ofs);
-          int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len;
-          if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) {
-            // Bad cdir
-            break;
-          }
-          // For now, only accept with non-zero file sizes and DEFLATE
-          if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) {
-            size_t lfh_ofs  = cdfh->hdr_ofs;
-            auto const* lfh = reinterpret_cast<zip_lfh_s const*>(raw + lfh_ofs);
-            if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 &&
-                lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) {
-              if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) {
-                size_t file_start = lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len;
-                size_t file_end   = file_start + lfh->comp_size;
-                if (file_end <= src.size()) {
-                  // Pick the first valid file of non-zero size (only 1 file expected in archive)
-                  compression = compression_type::ZIP;
-                  comp_data   = raw + file_start;
-                  comp_len    = lfh->comp_size;
-                  uncomp_len  = lfh->uncomp_size;
-                  break;
-                }
-              }
-            }
-          }
-          cdfh_ofs += cdfh_len;
-        }
-      }
-    }
-      if (compression != compression_type::AUTO) break;
-      [[fallthrough]];
-    case compression_type::BZIP2:
-      if (src.size() > 4) {
-        auto const* fhdr = reinterpret_cast<bz2_file_header_s const*>(raw);
-        // Check for BZIP2 file signature "BZh1" to "BZh9"
-        if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' &&
-            fhdr->blksz >= '1' && fhdr->blksz <= '9') {
-          compression = compression_type::BZIP2;
-          comp_data   = raw;
-          comp_len    = src.size();
-          uncomp_len  = 0;
-        }
-      }
-      if (compression != compression_type::AUTO) break;
-      [[fallthrough]];
-    default: CUDF_FAIL("Unsupported compressed stream type");
-  }
-
-  CUDF_EXPECTS(comp_data != nullptr and comp_len > 0, "Unsupported compressed stream type");
-
-  if (uncomp_len <= 0) {
-    uncomp_len = comp_len * 4 + 4096;  // In case uncompressed size isn't known in advance, assume
-                                       // ~4:1 compression for initial size
-  }
-
-  if (compression == compression_type::GZIP || compression == compression_type::ZIP) {
-    // INFLATE
-    std::vector<uint8_t> dst(uncomp_len);
-    cpu_inflate_vector(dst, comp_data, comp_len);
-    return dst;
-  }
-  if (compression == compression_type::BZIP2) {
-    size_t src_ofs = 0;
-    size_t dst_ofs = 0;
-    int bz_err     = 0;
-    std::vector<uint8_t> dst(uncomp_len);
-    do {
-      size_t dst_len = uncomp_len - dst_ofs;
-      bz_err = cpu_bz2_uncompress(comp_data, comp_len, dst.data() + dst_ofs, &dst_len, &src_ofs);
-      if (bz_err == BZ_OUTBUFF_FULL) {
-        // TBD: We could infer the compression ratio based on produced/consumed byte counts
-        // in order to minimize realloc events and over-allocation
-        dst_ofs = dst_len;
-        dst_len = uncomp_len + (uncomp_len / 2);
-        dst.resize(dst_len);
-        uncomp_len = dst_len;
-      } else if (bz_err == 0) {
-        uncomp_len = dst_len;
-        dst.resize(uncomp_len);
-      }
-    } while (bz_err == BZ_OUTBUFF_FULL);
-    CUDF_EXPECTS(bz_err == 0, "Decompression: error in stream");
-    return dst;
-  }
-
-  CUDF_FAIL("Unsupported compressed stream type");
+  CUDF_EXPECTS(zerr == Z_STREAM_END, "Error in DEFLATE stream: Z_STREAM_END not encountered");
 }
 
 /**
@@ -536,14 +417,130 @@ size_t decompress_zstd(host_span<uint8_t const> src,
   CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed");
 
   // Copy temporary output to `dst`
-  cudf::detail::cuda_memcpy_async(
-    dst.subspan(0, hd_stats[0].bytes_written),
-    device_span<uint8_t const>{d_dst.data(), hd_stats[0].bytes_written},
-    stream);
+  cudf::detail::cuda_memcpy(dst.subspan(0, hd_stats[0].bytes_written),
+                            device_span<uint8_t const>{d_dst.data(), hd_stats[0].bytes_written},
+                            stream);
 
   return hd_stats[0].bytes_written;
 }
 
+struct source_properties {
+  compression_type compression = compression_type::NONE;
+  uint8_t const* comp_data     = nullptr;
+  size_t comp_len              = 0;
+  size_t uncomp_len            = 0;
+};
+
+source_properties get_source_properties(compression_type compression, host_span<uint8_t const> src)
+{
+  auto raw                 = src.data();
+  uint8_t const* comp_data = nullptr;
+  size_t comp_len          = 0;
+  size_t uncomp_len        = 0;
+
+  switch (compression) {
+    case compression_type::AUTO:
+    case compression_type::GZIP: {
+      gz_archive_s gz;
+      auto const parse_succeeded = ParseGZArchive(&gz, src.data(), src.size());
+      CUDF_EXPECTS(parse_succeeded, "Failed to parse GZIP header while fetching source properties");
+      compression = compression_type::GZIP;
+      comp_data   = gz.comp_data;
+      comp_len    = gz.comp_len;
+      uncomp_len  = gz.isize;
+      if (compression != compression_type::AUTO) break;
+      [[fallthrough]];
+    }
+    case compression_type::ZIP: {
+      zip_archive_s za;
+      if (OpenZipArchive(&za, raw, src.size())) {
+        size_t cdfh_ofs = 0;
+        for (int i = 0; i < za.eocd->num_entries; i++) {
+          auto const* cdfh = reinterpret_cast<zip_cdfh_s const*>(
+            reinterpret_cast<uint8_t const*>(za.cdfh) + cdfh_ofs);
+          int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len;
+          if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) {
+            // Bad cdir
+            break;
+          }
+          // For now, only accept with non-zero file sizes and DEFLATE
+          if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) {
+            size_t lfh_ofs  = cdfh->hdr_ofs;
+            auto const* lfh = reinterpret_cast<zip_lfh_s const*>(raw + lfh_ofs);
+            if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 &&
+                lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) {
+              if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) {
+                size_t file_start = lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len;
+                size_t file_end   = file_start + lfh->comp_size;
+                if (file_end <= src.size()) {
+                  // Pick the first valid file of non-zero size (only 1 file expected in archive)
+                  compression = compression_type::ZIP;
+                  comp_data   = raw + file_start;
+                  comp_len    = lfh->comp_size;
+                  uncomp_len  = lfh->uncomp_size;
+                  break;
+                }
+              }
+            }
+          }
+          cdfh_ofs += cdfh_len;
+        }
+      }
+      if (compression != compression_type::AUTO) break;
+      [[fallthrough]];
+    }
+    case compression_type::BZIP2: {
+      if (src.size() > 4) {
+        auto const* fhdr = reinterpret_cast<bz2_file_header_s const*>(raw);
+        // Check for BZIP2 file signature "BZh1" to "BZh9"
+        if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' &&
+            fhdr->blksz >= '1' && fhdr->blksz <= '9') {
+          compression = compression_type::BZIP2;
+          comp_data   = raw;
+          comp_len    = src.size();
+          uncomp_len  = 0;
+        }
+      }
+      if (compression != compression_type::AUTO) break;
+      [[fallthrough]];
+    }
+    case compression_type::SNAPPY: {
+      uncomp_len     = 0;
+      auto cur       = src.begin();
+      auto const end = src.end();
+      // Read uncompressed length (varint)
+      {
+        uint32_t l = 0, c;
+        do {
+          c              = *cur++;
+          auto const lo7 = c & 0x7f;
+          if (l >= 28 && c > 0xf) {
+            uncomp_len = 0;
+            break;
+          }
+          uncomp_len |= lo7 << l;
+          l += 7;
+        } while (c > 0x7f && cur < end);
+        CUDF_EXPECTS(uncomp_len != 0 and cur < end, "Error in retrieving SNAPPY source properties");
+      }
+      comp_data = raw;
+      comp_len  = src.size();
+      if (compression != compression_type::AUTO) break;
+      [[fallthrough]];
+    }
+    default: CUDF_FAIL("Unsupported compressed stream type");
+  }
+
+  return source_properties{compression, comp_data, comp_len, uncomp_len};
+}
+
+}  // namespace
+
+size_t get_uncompressed_size(compression_type compression, host_span<uint8_t const> src)
+{
+  return get_source_properties(compression, src).uncomp_len;
+}
+
 size_t decompress(compression_type compression,
                   host_span<uint8_t const> src,
                   host_span<uint8_t> dst,
@@ -558,5 +555,63 @@ size_t decompress(compression_type compression,
   }
 }
 
-}  // namespace io
-}  // namespace cudf
+std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t const> src)
+{
+  CUDF_EXPECTS(src.data() != nullptr, "Decompression: Source cannot be nullptr");
+  CUDF_EXPECTS(not src.empty(), "Decompression: Source size cannot be 0");
+
+  auto srcprops = get_source_properties(compression, src);
+  CUDF_EXPECTS(srcprops.comp_data != nullptr and srcprops.comp_len > 0,
+               "Unsupported compressed stream type");
+
+  if (srcprops.uncomp_len <= 0) {
+    srcprops.uncomp_len =
+      srcprops.comp_len * 4 + 4096;  // In case uncompressed size isn't known in advance, assume
+                                     // ~4:1 compression for initial size
+  }
+
+  if (compression == compression_type::GZIP) {
+    // INFLATE
+    std::vector<uint8_t> dst(srcprops.uncomp_len);
+    decompress_gzip(src, dst);
+    return dst;
+  }
+  if (compression == compression_type::ZIP) {
+    std::vector<uint8_t> dst(srcprops.uncomp_len);
+    cpu_inflate_vector(dst, srcprops.comp_data, srcprops.comp_len);
+    return dst;
+  }
+  if (compression == compression_type::BZIP2) {
+    size_t src_ofs = 0;
+    size_t dst_ofs = 0;
+    int bz_err     = 0;
+    std::vector<uint8_t> dst(srcprops.uncomp_len);
+    do {
+      size_t dst_len = srcprops.uncomp_len - dst_ofs;
+      bz_err         = cpu_bz2_uncompress(
+        srcprops.comp_data, srcprops.comp_len, dst.data() + dst_ofs, &dst_len, &src_ofs);
+      if (bz_err == BZ_OUTBUFF_FULL) {
+        // TBD: We could infer the compression ratio based on produced/consumed byte counts
+        // in order to minimize realloc events and over-allocation
+        dst_ofs = dst_len;
+        dst_len = srcprops.uncomp_len + (srcprops.uncomp_len / 2);
+        dst.resize(dst_len);
+        srcprops.uncomp_len = dst_len;
+      } else if (bz_err == 0) {
+        srcprops.uncomp_len = dst_len;
+        dst.resize(srcprops.uncomp_len);
+      }
+    } while (bz_err == BZ_OUTBUFF_FULL);
+    CUDF_EXPECTS(bz_err == 0, "Decompression: error in stream");
+    return dst;
+  }
+  if (compression == compression_type::SNAPPY) {
+    std::vector<uint8_t> dst(srcprops.uncomp_len);
+    decompress_snappy(src, dst);
+    return dst;
+  }
+
+  CUDF_FAIL("Unsupported compressed stream type");
+}
+
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 279f5e71351..82d8152ca1c 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -24,6 +24,7 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -42,6 +43,70 @@ namespace cudf::io::json::detail {
 
 namespace {
 
+class compressed_host_buffer_source final : public datasource {
+ public:
+  explicit compressed_host_buffer_source(std::unique_ptr<datasource> const& src,
+                                         compression_type comptype)
+    : _comptype{comptype}, _dbuf_ptr{src->host_read(0, src->size())}
+  {
+    auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
+                                              _dbuf_ptr->size());
+    if (comptype == compression_type::GZIP || comptype == compression_type::ZIP ||
+        comptype == compression_type::SNAPPY) {
+      _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer);
+    } else {
+      _decompressed_buffer         = cudf::io::detail::decompress(_comptype, ch_buffer);
+      _decompressed_ch_buffer_size = _decompressed_buffer.size();
+    }
+  }
+
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
+  {
+    auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
+                                              _dbuf_ptr->size());
+    if (_decompressed_buffer.empty()) {
+      auto decompressed_hbuf = cudf::io::detail::decompress(_comptype, ch_buffer);
+      auto const count       = std::min(size, decompressed_hbuf.size() - offset);
+      bool partial_read      = offset + count < decompressed_hbuf.size();
+      if (!partial_read) {
+        std::memcpy(dst, decompressed_hbuf.data() + offset, count);
+        return count;
+      }
+      _decompressed_buffer = std::move(decompressed_hbuf);
+    }
+    auto const count = std::min(size, _decompressed_buffer.size() - offset);
+    std::memcpy(dst, _decompressed_buffer.data() + offset, count);
+    return count;
+  }
+
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  {
+    auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
+                                              _dbuf_ptr->size());
+    if (_decompressed_buffer.empty()) {
+      auto decompressed_hbuf = cudf::io::detail::decompress(_comptype, ch_buffer);
+      auto const count       = std::min(size, decompressed_hbuf.size() - offset);
+      bool partial_read      = offset + count < decompressed_hbuf.size();
+      if (!partial_read)
+        return std::make_unique<owning_buffer<std::vector<uint8_t>>>(
+          std::move(decompressed_hbuf), decompressed_hbuf.data() + offset, count);
+      _decompressed_buffer = std::move(decompressed_hbuf);
+    }
+    auto const count = std::min(size, _decompressed_buffer.size() - offset);
+    return std::make_unique<non_owning_buffer>(_decompressed_buffer.data() + offset, count);
+  }
+
+  [[nodiscard]] bool supports_device_read() const override { return false; }
+
+  [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; }
+
+ private:
+  std::unique_ptr<datasource::buffer> _dbuf_ptr;
+  compression_type _comptype;
+  size_t _decompressed_ch_buffer_size;
+  std::vector<std::uint8_t> _decompressed_buffer;
+};
+
 // Return total size of sources enclosing the passed range
 std::size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
                          std::size_t range_offset,
@@ -126,13 +191,12 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
 {
   CUDF_FUNC_RANGE();
 
-  std::size_t const total_source_size       = sources_size(sources, 0, 0);
-  auto constexpr num_delimiter_chars        = 1;
-  auto const delimiter                      = reader_opts.get_delimiter();
-  auto const num_extra_delimiters           = num_delimiter_chars * sources.size();
-  compression_type const reader_compression = reader_opts.get_compression();
-  std::size_t const chunk_offset            = reader_opts.get_byte_range_offset();
-  std::size_t chunk_size                    = reader_opts.get_byte_range_size();
+  std::size_t const total_source_size = sources_size(sources, 0, 0);
+  auto constexpr num_delimiter_chars  = 1;
+  auto const delimiter                = reader_opts.get_delimiter();
+  auto const num_extra_delimiters     = num_delimiter_chars * sources.size();
+  std::size_t const chunk_offset      = reader_opts.get_byte_range_offset();
+  std::size_t chunk_size              = reader_opts.get_byte_range_size();
 
   CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
                "Invalid offsetting",
@@ -143,22 +207,16 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   int num_subchunks_prealloced        = should_load_till_last_source ? 0 : max_subchunks_prealloced;
   std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
 
-  // The allocation for single source compressed input is estimated by assuming a ~4:1
-  // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea
-  // of subchunks.
-  auto constexpr header_size = 4096;
   std::size_t buffer_size =
-    reader_compression != compression_type::NONE
-      ? total_source_size * estimated_compression_ratio + header_size
-      : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
-          num_extra_delimiters;
+    std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
+    num_extra_delimiters;
   rmm::device_buffer buffer(buffer_size, stream);
   device_span<char> bufspan(reinterpret_cast<char*>(buffer.data()), buffer.size());
 
   // Offset within buffer indicating first read position
   std::int64_t buffer_offset = 0;
-  auto readbufspan           = ingest_raw_input(
-    bufspan, sources, reader_compression, chunk_offset, chunk_size, delimiter, stream);
+  auto readbufspan =
+    ingest_raw_input(bufspan, sources, chunk_offset, chunk_size, delimiter, stream);
 
   auto const shift_for_nonzero_offset = std::min<std::int64_t>(chunk_offset, 1);
   auto const first_delim_pos =
@@ -179,7 +237,6 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
         buffer_offset += readbufspan.size();
         readbufspan    = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
                                        sources,
-                                       reader_compression,
                                        next_subchunk_start,
                                        size_per_subchunk,
                                        delimiter,
@@ -196,11 +253,9 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
           // Our buffer_size estimate is insufficient to read until the end of the line! We need to
           // allocate more memory and try again!
           num_subchunks_prealloced *= 2;
-          buffer_size = reader_compression != compression_type::NONE
-                          ? 2 * buffer_size
-                          : std::min(total_source_size,
-                                     buffer_size + num_subchunks_prealloced * size_per_subchunk) +
-                              num_extra_delimiters;
+          buffer_size = std::min(total_source_size,
+                                 buffer_size + num_subchunks_prealloced * size_per_subchunk) +
+                        num_extra_delimiters;
           buffer.resize(buffer_size, stream);
           bufspan = device_span<char>(reinterpret_cast<char*>(buffer.data()), buffer.size());
         }
@@ -258,111 +313,11 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
 }
 
-}  // anonymous namespace
-
-device_span<char> ingest_raw_input(device_span<char> buffer,
-                                   host_span<std::unique_ptr<datasource>> sources,
-                                   compression_type compression,
-                                   std::size_t range_offset,
-                                   std::size_t range_size,
-                                   char delimiter,
-                                   rmm::cuda_stream_view stream)
+table_with_metadata read_json_impl(host_span<std::unique_ptr<datasource>> sources,
+                                   json_reader_options const& reader_opts,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::device_async_resource_ref mr)
 {
-  CUDF_FUNC_RANGE();
-  // We append a line delimiter between two files to make sure the last line of file i and the first
-  // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
-  // delimiter.
-  auto constexpr num_delimiter_chars = 1;
-
-  if (compression == compression_type::NONE) {
-    auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
-    std::vector<std::size_t> prefsum_source_sizes(sources.size());
-    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
-    std::size_t bytes_read = 0;
-    std::transform_inclusive_scan(sources.begin(),
-                                  sources.end(),
-                                  prefsum_source_sizes.begin(),
-                                  std::plus<std::size_t>{},
-                                  [](std::unique_ptr<datasource> const& s) { return s->size(); });
-    auto upper =
-      std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
-    std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
-
-    auto const total_bytes_to_read =
-      std::min(range_size, prefsum_source_sizes.back() - range_offset);
-    range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
-    for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read;
-         i++) {
-      if (sources[i]->is_empty()) continue;
-      auto data_size =
-        std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
-      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
-                         (num_delimiter_chars * delimiter_map.size());
-      if (sources[i]->is_device_read_preferred(data_size)) {
-        bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
-      } else {
-        h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
-        auto const& h_buffer = h_buffers.back();
-        CUDF_CUDA_TRY(cudaMemcpyAsync(
-          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value()));
-        bytes_read += h_buffer->size();
-      }
-      range_offset = 0;
-      delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size()));
-    }
-    // Removing delimiter inserted after last non-empty source is read
-    if (!delimiter_map.empty()) { delimiter_map.pop_back(); }
-
-    // If this is a multi-file source, we scatter the JSON line delimiters between files
-    if (sources.size() > 1) {
-      static_assert(num_delimiter_chars == 1,
-                    "Currently only single-character delimiters are supported");
-      auto const delimiter_source = thrust::make_constant_iterator(delimiter);
-      auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
-        delimiter_map, stream, cudf::get_current_device_resource_ref());
-      thrust::scatter(rmm::exec_policy_nosync(stream),
-                      delimiter_source,
-                      delimiter_source + d_delimiter_map.size(),
-                      d_delimiter_map.data(),
-                      buffer.data());
-    }
-    stream.synchronize();
-    return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
-  }
-  // TODO: allow byte range reading from multiple compressed files.
-  auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset);
-  auto hbuffer                 = std::vector<uint8_t>(remaining_bytes_to_read);
-  // Single read because only a single compressed source is supported
-  // Reading to host because decompression of a single block is much faster on the CPU
-  sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data());
-  auto uncomp_data = decompress(compression, hbuffer);
-  auto ret_buffer  = buffer.first(uncomp_data.size());
-  cudf::detail::cuda_memcpy<char>(
-    ret_buffer,
-    host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
-    stream);
-  return ret_buffer;
-}
-
-table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
-                              json_reader_options const& reader_opts,
-                              rmm::cuda_stream_view stream,
-                              rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-
-  if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) {
-    CUDF_EXPECTS(reader_opts.is_enabled_lines(),
-                 "Specifying a byte range is supported only for JSON Lines");
-  }
-
-  if (sources.size() > 1) {
-    CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE,
-                 "Multiple compressed inputs are not supported");
-    CUDF_EXPECTS(reader_opts.is_enabled_lines(),
-                 "Multiple inputs are supported only for JSON Lines format");
-  }
-
   /*
    * The batched JSON reader enforces that the size of each batch is at most INT_MAX
    * bytes (~2.14GB). Batches are defined to be byte range chunks - characterized by
@@ -462,4 +417,101 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                              {partial_tables[0].metadata.schema_info}};
 }
 
+}  // anonymous namespace
+
+device_span<char> ingest_raw_input(device_span<char> buffer,
+                                   host_span<std::unique_ptr<datasource>> sources,
+                                   std::size_t range_offset,
+                                   std::size_t range_size,
+                                   char delimiter,
+                                   rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  // We append a line delimiter between two files to make sure the last line of file i and the first
+  // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
+  // delimiter.
+  auto constexpr num_delimiter_chars = 1;
+
+  auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
+  std::vector<std::size_t> prefsum_source_sizes(sources.size());
+  std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
+  std::size_t bytes_read = 0;
+  std::transform_inclusive_scan(sources.begin(),
+                                sources.end(),
+                                prefsum_source_sizes.begin(),
+                                std::plus<std::size_t>{},
+                                [](std::unique_ptr<datasource> const& s) { return s->size(); });
+  auto upper =
+    std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
+  std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
+
+  auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
+  range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
+  for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
+    if (sources[i]->is_empty()) continue;
+    auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
+    auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
+                       (num_delimiter_chars * delimiter_map.size());
+    if (sources[i]->is_device_read_preferred(data_size)) {
+      bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+    } else {
+      h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
+      auto const& h_buffer = h_buffers.back();
+      CUDF_CUDA_TRY(cudaMemcpyAsync(
+        destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value()));
+      bytes_read += h_buffer->size();
+    }
+    range_offset = 0;
+    delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size()));
+  }
+  // Removing delimiter inserted after last non-empty source is read
+  if (!delimiter_map.empty()) { delimiter_map.pop_back(); }
+
+  // If this is a multi-file source, we scatter the JSON line delimiters between files
+  if (sources.size() > 1 && !delimiter_map.empty()) {
+    static_assert(num_delimiter_chars == 1,
+                  "Currently only single-character delimiters are supported");
+    auto const delimiter_source = thrust::make_constant_iterator(delimiter);
+    auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
+      delimiter_map, stream, cudf::get_current_device_resource_ref());
+    thrust::scatter(rmm::exec_policy_nosync(stream),
+                    delimiter_source,
+                    delimiter_source + d_delimiter_map.size(),
+                    d_delimiter_map.data(),
+                    buffer.data());
+  }
+  stream.synchronize();
+  return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
+}
+
+table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
+                              json_reader_options const& reader_opts,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+
+  if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) {
+    CUDF_EXPECTS(reader_opts.is_enabled_lines(),
+                 "Specifying a byte range is supported only for JSON Lines");
+  }
+
+  if (sources.size() > 1) {
+    CUDF_EXPECTS(reader_opts.is_enabled_lines(),
+                 "Multiple inputs are supported only for JSON Lines format");
+  }
+
+  if (reader_opts.get_compression() == compression_type::NONE)
+    return read_json_impl(sources, reader_opts, stream, mr);
+
+  std::vector<std::unique_ptr<datasource>> compressed_sources;
+  for (size_t i = 0; i < sources.size(); i++) {
+    compressed_sources.emplace_back(
+      std::make_unique<compressed_host_buffer_source>(sources[i], reader_opts.get_compression()));
+  }
+  // in read_json_impl, we need the compressed source size to actually be the
+  // uncompressed source size for correct batching
+  return read_json_impl(compressed_sources, reader_opts, stream, mr);
+}
+
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 4def69cc629..ac980938522 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -32,10 +32,9 @@ namespace CUDF_EXPORT cudf {
 namespace io::json::detail {
 
 // Some magic numbers
-constexpr int num_subchunks               = 10;  // per chunk_size
-constexpr size_t min_subchunk_size        = 10000;
-constexpr int estimated_compression_ratio = 4;
-constexpr int max_subchunks_prealloced    = 3;
+constexpr int num_subchunks            = 10;  // per chunk_size
+constexpr size_t min_subchunk_size     = 10000;
+constexpr int max_subchunks_prealloced = 3;
 
 /**
  * @brief Read from array of data sources into RMM buffer. The size of the returned device span
@@ -45,15 +44,14 @@ constexpr int max_subchunks_prealloced    = 3;
  *
  * @param buffer Device span buffer to which data is read
  * @param sources Array of data sources
- * @param compression Compression format of source
  * @param range_offset Number of bytes to skip from source start
  * @param range_size Number of bytes to read from source
+ * @param delimiter Delimiter character for JSONL inputs
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @returns A subspan of the input device span containing data read
  */
 device_span<char> ingest_raw_input(device_span<char> buffer,
                                    host_span<std::unique_ptr<datasource>> sources,
-                                   compression_type compression,
                                    size_t range_offset,
                                    size_t range_size,
                                    char delimiter,
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index e1241f8f90c..8156258c810 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -19,6 +19,7 @@
  * @brief cuDF-IO JSON writer implementation
  */
 
+#include "io/comp/comp.hpp"
 #include "io/csv/durations.hpp"
 #include "io/utilities/parsing_utils.cuh"
 #include "lists/utilities.hpp"
@@ -828,10 +829,10 @@ void write_chunked(data_sink* out_sink,
   }
 }
 
-void write_json(data_sink* out_sink,
-                table_view const& table,
-                json_writer_options const& options,
-                rmm::cuda_stream_view stream)
+void write_json_uncompressed(data_sink* out_sink,
+                             table_view const& table,
+                             json_writer_options const& options,
+                             rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   std::vector<column_name_info> user_column_names = [&]() {
@@ -934,4 +935,24 @@ void write_json(data_sink* out_sink,
   }
 }
 
+void write_json(data_sink* out_sink,
+                table_view const& table,
+                json_writer_options const& options,
+                rmm::cuda_stream_view stream)
+{
+  if (options.get_compression() != compression_type::NONE) {
+    std::vector<char> hbuf;
+    auto hbuf_sink_ptr = data_sink::create(&hbuf);
+    write_json_uncompressed(hbuf_sink_ptr.get(), table, options, stream);
+    stream.synchronize();
+    auto comp_hbuf = cudf::io::detail::compress(
+      options.get_compression(),
+      host_span<uint8_t>(reinterpret_cast<uint8_t*>(hbuf.data()), hbuf.size()),
+      stream);
+    out_sink->host_write(comp_hbuf.data(), comp_hbuf.size());
+    return;
+  }
+  write_json_uncompressed(out_sink, table, options, stream);
+}
+
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index 1fe5e5aa41e..7046b3b3f91 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -460,7 +460,7 @@ host_span<uint8_t const> OrcDecompressor::decompress_blocks(host_span<uint8_t co
       dst_length += block_len;
     } else {
       // Compressed block
-      dst_length += decompress(
+      dst_length += cudf::io::detail::decompress(
         _compression, src.subspan(i, block_len), {m_buf.data() + dst_length, m_blockSize}, stream);
     }
     i += block_len;
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 9ed2929a70e..ab40a5b9a55 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -149,10 +149,21 @@ inline __device__ bool is_bounds_page(page_state_s* const s,
   size_t const begin      = start_row;
   size_t const end        = start_row + num_rows;
 
-  // for non-nested schemas, rows cannot span pages, so use a more restrictive test
-  return has_repetition
-           ? ((page_begin <= begin && page_end >= begin) || (page_begin <= end && page_end >= end))
-           : ((page_begin < begin && page_end > begin) || (page_begin < end && page_end > end));
+  // Test for list schemas.
+  auto const is_bounds_page_lists =
+    ((page_begin <= begin and page_end >= begin) or (page_begin <= end and page_end >= end));
+
+  // For non-list schemas, rows cannot span pages, so use a more restrictive test. Make sure to
+  // relax the test for `page_end` if we adjusted the `num_rows` for the last page to compensate
+  // for list row size estimates in `generate_list_column_row_count_estimates()` when chunked
+  // read mode.
+  auto const test_page_end_nonlists =
+    s->page.is_num_rows_adjusted ? page_end >= end : page_end > end;
+
+  auto const is_bounds_page_nonlists =
+    (page_begin < begin and page_end > begin) or (page_begin < end and test_page_end_nonlists);
+
+  return has_repetition ? is_bounds_page_lists : is_bounds_page_nonlists;
 }
 
 /**
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index a8a8c441a84..6aec4ce0ec2 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -433,6 +433,7 @@ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks,
       // definition levels
       bs->page.chunk_row            = 0;
       bs->page.num_rows             = 0;
+      bs->page.is_num_rows_adjusted = false;
       bs->page.skipped_values       = -1;
       bs->page.skipped_leaf_values  = 0;
       bs->page.str_bytes            = 0;
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 3b4d0e6dc80..ce9d48693ec 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -310,8 +310,10 @@ struct PageInfo {
   // - In the case of a nested schema, you have to decode the repetition and definition
   //   levels to extract actual column values
   int32_t num_input_values;
-  int32_t chunk_row;  // starting row of this page relative to the start of the chunk
-  int32_t num_rows;   // number of rows in this page
+  int32_t chunk_row;          // starting row of this page relative to the start of the chunk
+  int32_t num_rows;           // number of rows in this page
+  bool is_num_rows_adjusted;  // Flag to indicate if the number of rows of this page have been
+                              // adjusted to compensate for the list row size estimates.
   // the next four are calculated in gpuComputePageStringSizes
   int32_t num_nulls;       // number of null values (V2 header), but recalculated for string cols
   int32_t num_valids;      // number of non-null values, taking into account skip_rows/num_rows
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index f03f1214b9a..bcdae4cbd3b 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -729,7 +729,10 @@ struct set_final_row_count {
     if (i < pages.size() - 1 && (pages[i + 1].chunk_idx == page.chunk_idx)) { return; }
     size_t const page_start_row = chunk.start_row + page.chunk_row;
     size_t const chunk_last_row = chunk.start_row + chunk.num_rows;
-    page.num_rows               = chunk_last_row - page_start_row;
+    // Mark `is_num_rows_adjusted` to signal string decoders that the `num_rows` of this page has
+    // been adjusted.
+    page.is_num_rows_adjusted = page.num_rows != (chunk_last_row - page_start_row);
+    page.num_rows             = chunk_last_row - page_start_row;
   }
 };
 
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index 3307b4fa539..cea0ebad8f5 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -56,7 +56,8 @@ void set_up_kvikio()
 {
   static std::once_flag flag{};
   std::call_once(flag, [] {
-    auto const compat_mode = kvikio::detail::getenv_or<bool>("KVIKIO_COMPAT_MODE", true);
+    auto const compat_mode =
+      kvikio::detail::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON);
     kvikio::defaults::compat_mode_reset(compat_mode);
 
     auto const nthreads = getenv_or<unsigned int>("KVIKIO_NTHREADS", 4u);
diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index 68377ad6d5f..b37a5ac900a 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -45,7 +45,7 @@ class file_sink : public data_sink {
       cufile_integration::set_up_kvikio();
       _kvikio_file = kvikio::FileHandle(filepath, "w");
       CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.",
-                    _kvikio_file.is_compat_mode_on() ? "on" : "off");
+                    _kvikio_file.is_compat_mode_preferred() ? "on" : "off");
     } else {
       _cufile_out = detail::make_cufile_output(filepath);
     }
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 0870e4a84a7..10814eea458 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -56,7 +56,7 @@ class file_source : public datasource {
       cufile_integration::set_up_kvikio();
       _kvikio_file = kvikio::FileHandle(filepath);
       CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
-                    _kvikio_file.is_compat_mode_on() ? "on" : "off");
+                    _kvikio_file.is_compat_mode_preferred() ? "on" : "off");
     } else {
       _cufile_in = detail::make_cufile_input(filepath);
     }
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index 515d28201e8..ce4d2067b82 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -33,7 +33,9 @@
 #include <cub/block/block_scan.cuh>
 #include <cuco/static_set.cuh>
 #include <thrust/fill.h>
+#include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/sequence.h>
 
 #include <cstddef>
@@ -79,14 +81,9 @@ class build_keys_fn {
 
 /**
  * @brief Device output transform functor to construct `size_type` with `cuco::pair<hash_value_type,
- * lhs_index_type>` or `cuco::pair<hash_value_type, rhs_index_type>`
+ * rhs_index_type>`
  */
 struct output_fn {
-  __device__ constexpr cudf::size_type operator()(
-    cuco::pair<hash_value_type, lhs_index_type> const& x) const
-  {
-    return static_cast<cudf::size_type>(x.second);
-  }
   __device__ constexpr cudf::size_type operator()(
     cuco::pair<hash_value_type, rhs_index_type> const& x) const
   {
@@ -176,15 +173,33 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
   auto const iter           = cudf::detail::make_counting_transform_iterator(
     0, build_keys_fn<decltype(d_probe_hasher), lhs_index_type>{d_probe_hasher});
 
-  auto const build_indices_begin =
-    thrust::make_transform_output_iterator(build_indices->begin(), output_fn{});
-  auto const probe_indices_begin =
-    thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});
-
-  auto const [probe_indices_end, _] = this->_hash_table.retrieve(
-    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});
+  auto found_indices = rmm::device_uvector<size_type>(probe_table_num_rows, stream);
+  auto const found_begin =
+    thrust::make_transform_output_iterator(found_indices.begin(), output_fn{});
+
+  // TODO conditional find for nulls once `cuco::static_set::find_if` is added
+  // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal
+  // to `JoinNoneValue`, then `idx` has a match in the hash set.
+  this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value());
+
+  auto const tuple_iter = cudf::detail::make_counting_transform_iterator(
+    0,
+    cuda::proclaim_return_type<thrust::tuple<size_type, size_type>>(
+      [found_iter = found_indices.begin()] __device__(size_type idx) {
+        return thrust::tuple{*(found_iter + idx), idx};
+      }));
+  auto const output_begin =
+    thrust::make_zip_iterator(build_indices->begin(), probe_indices->begin());
+  auto const output_end =
+    thrust::copy_if(rmm::exec_policy_nosync(stream),
+                    tuple_iter,
+                    tuple_iter + probe_table_num_rows,
+                    found_indices.begin(),
+                    output_begin,
+                    cuda::proclaim_return_type<bool>(
+                      [] __device__(size_type idx) { return idx != JoinNoneValue; }));
+  auto const actual_size = std::distance(output_begin, output_end);
 
-  auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);
   probe_indices->resize(actual_size, stream);
 
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 3bd67001c16..7cdce1ff735 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -23,9 +23,11 @@
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <cudf/aggregation.hpp>
 #include <cudf/binaryop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
@@ -820,4 +822,24 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
 }
 
+TEST(BinaryOperationCompiledTest, LargeColumnNoOverflow)
+{
+  cudf::size_type num_rows{1'799'989'091};
+  auto big = cudf::make_column_from_scalar(
+    cudf::numeric_scalar<cudf::id_to_type<cudf::type_id::INT8>>{10, true}, num_rows);
+  auto small = cudf::make_column_from_scalar(
+    cudf::numeric_scalar<cudf::id_to_type<cudf::type_id::INT8>>{1, true}, num_rows);
+
+  auto mask = cudf::binary_operation(big->view(),
+                                     small->view(),
+                                     cudf::binary_operator::GREATER,
+                                     cudf::data_type{cudf::type_id::BOOL8});
+
+  auto agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  auto result =
+    cudf::reduce(mask->view(), *agg, cudf::data_type{cudf::type_to_id<cudf::size_type>()});
+  auto got = static_cast<cudf::numeric_scalar<cudf::size_type>*>(result.get())->value();
+  EXPECT_EQ(num_rows, got);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_chunked_reader.cu b/cpp/tests/io/json/json_chunked_reader.cu
index c9ee6542a4d..9c8208b8300 100644
--- a/cpp/tests/io/json/json_chunked_reader.cu
+++ b/cpp/tests/io/json/json_chunked_reader.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "io/comp/comp.hpp"
 #include "json_utils.cuh"
 
 #include <cudf_test/base_fixture.hpp>
@@ -31,36 +32,66 @@
 /**
  * @brief Base test fixture for JSON reader tests
  */
-struct JsonReaderTest : public cudf::test::BaseFixture {};
+struct JsonReaderTest : public cudf::test::BaseFixture,
+                        public testing::WithParamInterface<cudf::io::compression_type> {};
+
+// Parametrize qualifying JSON tests for multiple compression types
+INSTANTIATE_TEST_SUITE_P(JsonReaderTest,
+                         JsonReaderTest,
+                         ::testing::Values(cudf::io::compression_type::GZIP,
+                                           cudf::io::compression_type::SNAPPY,
+                                           cudf::io::compression_type::NONE));
 
 cudf::test::TempDirTestEnvironment* const temp_env =
   static_cast<cudf::test::TempDirTestEnvironment*>(
     ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
 
-TEST_F(JsonReaderTest, ByteRange_SingleSource)
+TEST_P(JsonReaderTest, ByteRange_SingleSource)
 {
+  cudf::io::compression_type const comptype = GetParam();
+
   std::string const json_string = R"(
     { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
     { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
     { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
     { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
 
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(json_string.data()),
+                                     json_string.size()),
+      cudf::get_default_stream());
+  } else
+    cdata = std::vector<uint8_t>(
+      reinterpret_cast<uint8_t const*>(json_string.data()),
+      reinterpret_cast<uint8_t const*>(json_string.data()) + json_string.size());
+
   // Initialize parsing options (reading json lines)
   cudf::io::json_reader_options json_lines_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{json_string.c_str(), json_string.size()})
       .compression(cudf::io::compression_type::NONE)
       .lines(true);
+  cudf::io::json_reader_options cjson_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{cudf::host_span<uint8_t>(cdata.data(), cdata.size())})
+      .compression(comptype)
+      .lines(true);
 
   // Read full test data via existing, nested JSON lines reader
-  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
+  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(cjson_lines_options);
 
-  auto datasources = cudf::io::datasource::create(json_lines_options.get_source().host_buffers());
+  auto datasources  = cudf::io::datasource::create(json_lines_options.get_source().host_buffers());
+  auto cdatasources = cudf::io::datasource::create(cjson_lines_options.get_source().host_buffers());
 
   // Test for different chunk sizes
   for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) {
     auto const tables = split_byte_range_reading(datasources,
+                                                 cdatasources,
                                                  json_lines_options,
+                                                 cjson_lines_options,
                                                  chunk_size,
                                                  cudf::get_default_stream(),
                                                  cudf::get_current_device_resource_ref());
@@ -77,37 +108,54 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource)
   }
 }
 
-TEST_F(JsonReaderTest, ReadCompleteFiles)
+TEST_P(JsonReaderTest, ReadCompleteFiles)
 {
+  cudf::io::compression_type const comptype = GetParam();
+
   std::string const json_string = R"(
     { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
     { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
     { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
     { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
-  auto filename                 = temp_env->get_temp_dir() + "ParseInRangeIntegers.json";
+
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(json_string.data()),
+                                     json_string.size()),
+      cudf::get_default_stream());
+  } else
+    cdata = std::vector<uint8_t>(
+      reinterpret_cast<uint8_t const*>(json_string.data()),
+      reinterpret_cast<uint8_t const*>(json_string.data()) + json_string.size());
+
+  auto cfilename = temp_env->get_temp_dir() + "cParseInRangeIntegers.json";
   {
-    std::ofstream outfile(filename, std::ofstream::out);
-    outfile << json_string;
+    std::ofstream outfile(cfilename, std::ofstream::out);
+    std::copy(cdata.begin(), cdata.end(), std::ostreambuf_iterator<char>(outfile));
   }
 
   constexpr int num_sources = 5;
-  std::vector<std::string> filepaths(num_sources, filename);
+  std::vector<std::string> cfilepaths(num_sources, cfilename);
 
-  cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths})
+  cudf::io::json_reader_options cin_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{cfilepaths})
       .lines(true)
+      .compression(comptype)
       .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
 
-  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  cudf::io::table_with_metadata result = cudf::io::read_json(cin_options);
 
   std::vector<cudf::io::table_with_metadata> part_tables;
-  for (auto filepath : filepaths) {
-    cudf::io::json_reader_options part_in_options =
-      cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
+  for (auto cfilepath : cfilepaths) {
+    cudf::io::json_reader_options part_cin_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{cfilepath})
         .lines(true)
+        .compression(comptype)
         .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
 
-    part_tables.push_back(cudf::io::read_json(part_in_options));
+    part_tables.push_back(cudf::io::read_json(part_cin_options));
   }
 
   auto part_table_views = std::vector<cudf::table_view>(part_tables.size());
@@ -120,42 +168,69 @@ TEST_F(JsonReaderTest, ReadCompleteFiles)
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result.tbl->view(), expected_result->view());
 }
 
-TEST_F(JsonReaderTest, ByteRange_MultiSource)
+TEST_P(JsonReaderTest, ByteRange_MultiSource)
 {
+  cudf::io::compression_type const comptype = GetParam();
+
   std::string const json_string = R"(
     { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
     { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
     { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
     { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
-  auto filename                 = temp_env->get_temp_dir() + "ParseInRangeIntegers.json";
+
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(json_string.data()),
+                                     json_string.size()),
+      cudf::get_default_stream());
+  } else
+    cdata = std::vector<uint8_t>(
+      reinterpret_cast<uint8_t const*>(json_string.data()),
+      reinterpret_cast<uint8_t const*>(json_string.data()) + json_string.size());
+
+  auto cfilename = temp_env->get_temp_dir() + "cParseInRangeIntegers.json";
   {
-    std::ofstream outfile(filename, std::ofstream::out);
-    outfile << json_string;
+    std::ofstream outfile(cfilename, std::ofstream::out);
+    std::copy(cdata.begin(), cdata.end(), std::ostreambuf_iterator<char>(outfile));
   }
 
   constexpr int num_sources = 5;
-  std::vector<std::string> filepaths(num_sources, filename);
+  std::vector<std::string> cfilepaths(num_sources, cfilename);
+  std::vector<cudf::host_span<char const>> hostbufs(
+    num_sources,
+    cudf::host_span<char const>(reinterpret_cast<char const*>(json_string.data()),
+                                json_string.size()));
 
   // Initialize parsing options (reading json lines)
   cudf::io::json_reader_options json_lines_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths})
-      .lines(true)
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<char const>>(hostbufs.data(), hostbufs.size())})
       .compression(cudf::io::compression_type::NONE)
+      .lines(true);
+  cudf::io::json_reader_options cjson_lines_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{cfilepaths})
+      .lines(true)
+      .compression(comptype)
       .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
 
   // Read full test data via existing, nested JSON lines reader
-  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
+  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(cjson_lines_options);
 
-  auto file_paths = json_lines_options.get_source().filepaths();
-  std::vector<std::unique_ptr<cudf::io::datasource>> datasources;
-  for (auto& fp : file_paths) {
-    datasources.emplace_back(cudf::io::datasource::create(fp));
+  std::vector<std::unique_ptr<cudf::io::datasource>> cdatasources;
+  for (auto& fp : cfilepaths) {
+    cdatasources.emplace_back(cudf::io::datasource::create(fp));
   }
+  auto datasources = cudf::io::datasource::create(json_lines_options.get_source().host_buffers());
 
   // Test for different chunk sizes
   for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) {
     auto const tables = split_byte_range_reading(datasources,
+                                                 cdatasources,
                                                  json_lines_options,
+                                                 cjson_lines_options,
                                                  chunk_size,
                                                  cudf::get_default_stream(),
                                                  cudf::get_current_device_resource_ref());
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 26937c9298a..3c8db99c3c7 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+#include "io/comp/comp.hpp"
+#include "io/comp/io_uncomp.hpp"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -3252,4 +3255,59 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder)
   }
 }
 
+struct JsonCompressedIOTest : public cudf::test::BaseFixture,
+                              public testing::WithParamInterface<cudf::io::compression_type> {};
+
+// Parametrize qualifying JSON tests for multiple compression types
+INSTANTIATE_TEST_SUITE_P(JsonCompressedIOTest,
+                         JsonCompressedIOTest,
+                         ::testing::Values(cudf::io::compression_type::GZIP,
+                                           cudf::io::compression_type::SNAPPY,
+                                           cudf::io::compression_type::NONE));
+
+TEST_P(JsonCompressedIOTest, BasicJsonLines)
+{
+  cudf::io::compression_type const comptype = GetParam();
+  std::string data                          = to_records_orient(
+    {{{"0", "1"}, {"1", "1.1"}}, {{"0", "2"}, {"1", "2.2"}}, {{"0", "3"}, {"1", "3.3"}}}, "\n");
+
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(data.data()), data.size()),
+      cudf::get_default_stream());
+    auto decomp_out_buffer = cudf::io::detail::decompress(
+      comptype, cudf::host_span<uint8_t const>(cdata.data(), cdata.size()));
+    std::string const expected = R"({"0":1, "1":1.1}
+{"0":2, "1":2.2}
+{"0":3, "1":3.3})";
+    EXPECT_EQ(
+      expected,
+      std::string(reinterpret_cast<char*>(decomp_out_buffer.data()), decomp_out_buffer.size()));
+  } else
+    cdata = std::vector<uint8_t>(reinterpret_cast<uint8_t*>(data.data()),
+                                 reinterpret_cast<uint8_t*>(data.data()) + data.size());
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{cudf::host_span<uint8_t>(cdata.data(), cdata.size())})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<double>()})
+      .compression(comptype)
+      .lines(true);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 2);
+  EXPECT_EQ(result.tbl->num_rows(), 3);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32);
+  EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+  EXPECT_EQ(result.metadata.schema_info[1].name, "1");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2, 3}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}});
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh
index c31bb2d24e0..629a89fa777 100644
--- a/cpp/tests/io/json/json_utils.cuh
+++ b/cpp/tests/io/json/json_utils.cuh
@@ -32,7 +32,9 @@
 template <typename IndexType = std::int32_t>
 std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
   cudf::host_span<std::unique_ptr<cudf::io::datasource>> sources,
+  cudf::host_span<std::unique_ptr<cudf::io::datasource>> csources,
   cudf::io::json_reader_options const& reader_opts,
+  cudf::io::json_reader_options const& creader_opts,
   IndexType chunk_size,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
@@ -49,7 +51,6 @@ std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
     rmm::device_uvector<char> buffer(total_source_size, stream);
     auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer,
                                                                 sources,
-                                                                reader_opts.get_compression(),
                                                                 reader_opts.get_byte_range_offset(),
                                                                 reader_opts.get_byte_range_size(),
                                                                 reader_opts.get_delimiter(),
@@ -95,10 +96,11 @@ std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
   record_ranges.emplace_back(prev, total_source_size);
 
   std::vector<cudf::io::table_with_metadata> tables;
+  auto creader_opts_chunk = creader_opts;
   for (auto const& [chunk_start, chunk_end] : record_ranges) {
-    reader_opts_chunk.set_byte_range_offset(chunk_start);
-    reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start);
-    tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr));
+    creader_opts_chunk.set_byte_range_offset(chunk_start);
+    creader_opts_chunk.set_byte_range_size(chunk_end - chunk_start);
+    tables.push_back(cudf::io::json::detail::read_json(csources, creader_opts_chunk, stream, mr));
   }
   // assume all records have same number of columns, and inferred same type. (or schema is passed)
   // TODO a step before to merge all columns, types and infer final schema.
diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp
index 39d31c406a5..b96fc6425e4 100644
--- a/cpp/tests/io/json/json_writer.cpp
+++ b/cpp/tests/io/json/json_writer.cpp
@@ -14,10 +14,14 @@
  * limitations under the License.
  */
 
+#include "io/comp/io_uncomp.hpp"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/default_stream.hpp>
 #include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/detail/iterator.cuh>
@@ -31,7 +35,36 @@
 
 struct JsonWriterTest : public cudf::test::BaseFixture {};
 
-TEST_F(JsonWriterTest, EmptyInput)
+/**
+ * @brief Test fixture for parametrized JSON reader tests
+ */
+struct JsonCompressedWriterTest : public cudf::test::BaseFixture,
+                                  public testing::WithParamInterface<cudf::io::compression_type> {};
+
+// Parametrize qualifying JSON tests for multiple compression types
+INSTANTIATE_TEST_SUITE_P(JsonCompressedWriterTest,
+                         JsonCompressedWriterTest,
+                         ::testing::Values(cudf::io::compression_type::GZIP,
+                                           cudf::io::compression_type::SNAPPY,
+                                           cudf::io::compression_type::NONE));
+
+void run_test(cudf::io::json_writer_options const& wopts, std::string const& expected)
+{
+  auto outbuf   = wopts.get_sink().buffers().front();
+  auto comptype = wopts.get_compression();
+  cudf::io::write_json(wopts, cudf::test::get_default_stream());
+  if (comptype != cudf::io::compression_type::NONE) {
+    auto decomp_out_buffer = cudf::io::detail::decompress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t*>(outbuf->data()), outbuf->size()));
+    EXPECT_EQ(expected,
+              std::string_view(reinterpret_cast<char*>(decomp_out_buffer.data()),
+                               decomp_out_buffer.size()));
+  } else
+    EXPECT_EQ(expected, std::string_view(outbuf->data(), outbuf->size()));
+}
+
+TEST_P(JsonCompressedWriterTest, EmptyInput)
 {
   cudf::test::strings_column_wrapper col1;
   cudf::test::strings_column_wrapper col2;
@@ -49,28 +82,21 @@ TEST_F(JsonWriterTest, EmptyInput)
                        .lines(false)
                        .na_rep("null")
                        .build();
-
-  // Empty columns in table
-  cudf::io::write_json(out_options, cudf::test::get_default_stream());
-  std::string const expected = R"([])";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, "[]");
 
   // Empty columns in table - JSON Lines
   out_buffer.clear();
   out_options.enable_lines(true);
-  cudf::io::write_json(out_options, cudf::test::get_default_stream());
-  std::string const expected_lines = "\n";
-  EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, "\n");
 
   // Empty table - JSON Lines
   cudf::table_view tbl_view2{};
   out_options.set_table(tbl_view2);
   out_buffer.clear();
-  cudf::io::write_json(out_options, cudf::test::get_default_stream());
-  EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, "\n");
 }
 
-TEST_F(JsonWriterTest, EmptyLeaf)
+TEST_P(JsonCompressedWriterTest, EmptyLeaf)
 {
   cudf::test::strings_column_wrapper col1{""};
   cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets{0, 0};
@@ -92,19 +118,14 @@ TEST_F(JsonWriterTest, EmptyLeaf)
                        .lines(false)
                        .na_rep("null")
                        .build();
-
-  // Empty columns in table
-  cudf::io::write_json(out_options, cudf::test::get_default_stream());
-  std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, R"([{"col1":"","col2":[],"col3":[]}])");
 
   // Empty columns in table - JSON Lines
   out_buffer.clear();
   out_options.enable_lines(true);
-  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})"
                                      "\n";
-  EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected_lines);
 }
 
 TEST_F(JsonWriterTest, ErrorCases)
@@ -141,33 +162,34 @@ TEST_F(JsonWriterTest, ErrorCases)
                cudf::logic_error);
 }
 
-TEST_F(JsonWriterTest, PlainTable)
+TEST_P(JsonCompressedWriterTest, PlainTable)
 {
+  cudf::io::compression_type const comptype = GetParam();
   cudf::test::strings_column_wrapper col1{"a", "b", "c"};
   cudf::test::strings_column_wrapper col2{"d", "e", "f"};
-  cudf::test::fixed_width_column_wrapper<int> col3{1, 2, 3};
-  cudf::test::fixed_width_column_wrapper<float> col4{1.5, 2.5, 3.5};
-  cudf::test::fixed_width_column_wrapper<int16_t> col5{{1, 2, 3},
+  cudf::test::fixed_width_column_wrapper<int64_t> col3{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<double> col4{1.5, 2.5, 3.5};
+  cudf::test::fixed_width_column_wrapper<int64_t> col5{{1, 2, 3},
                                                        cudf::test::iterators::nulls_at({0, 2})};
   cudf::table_view tbl_view{{col1, col2, col3, col4, col5}};
-  cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}};
+  cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}, {"col4"}, {"col5"}}};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(true)
-                           .metadata(mt)
-                           .lines(false)
-                           .na_rep("null");
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(true)
+                       .metadata(mt)
+                       .lines(false)
+                       .compression(comptype)
+                       .na_rep("null")
+                       .build();
 
   std::string const expected =
-    R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+    R"([{"col1":"a","col2":"d","col3":1,"col4":1.5,"col5":null},{"col1":"b","col2":"e","col3":2,"col4":2.5,"col5":2},{"col1":"c","col2":"f","col3":3,"col4":3.5,"col5":null}])";
+  run_test(out_options, expected);
 }
 
-TEST_F(JsonWriterTest, SimpleNested)
+TEST_P(JsonCompressedWriterTest, SimpleNested)
 {
   std::string const data = R"(
 {"a": 1, "b": 2, "c": {"d": 3        }, "f": 5.5,  "g": [1]}
@@ -183,23 +205,23 @@ TEST_F(JsonWriterTest, SimpleNested)
   cudf::io::table_metadata mt{result.metadata};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(false)
-                           .metadata(mt)
-                           .lines(true)
-                           .na_rep("null");
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(false)
+                       .metadata(mt)
+                       .lines(true)
+                       .na_rep("null")
+                       .build();
+
   std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]}
 {"a":6,"b":7,"c":{"d":8},"f":10.5}
 {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]}
 {"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[3,4,5]}
 )";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected);
 }
 
-TEST_F(JsonWriterTest, MixedNested)
+TEST_P(JsonCompressedWriterTest, MixedNested)
 {
   std::string const data = R"(
 {"a": 1, "b": 2, "c": {"d": [3]      }, "f": 5.5,  "g": [ {"h": 1}]}
@@ -215,20 +237,20 @@ TEST_F(JsonWriterTest, MixedNested)
   cudf::io::table_metadata mt{result.metadata};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(false)
-                           .metadata(mt)
-                           .lines(false)
-                           .na_rep("null");
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(false)
+                       .metadata(mt)
+                       .lines(false)
+                       .na_rep("null")
+                       .build();
+
   std::string const expected =
     R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)"
     R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)"
     R"({"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[{"h":2},null]},)"
     R"({"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[{"h":3},{"h":4},{"h":5}]}])";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected);
 }
 
 TEST_F(JsonWriterTest, WriteReadNested)
@@ -375,7 +397,7 @@ TEST_F(JsonWriterTest, WriteReadNested)
   }
 }
 
-TEST_F(JsonWriterTest, SpecialChars)
+TEST_P(JsonCompressedWriterTest, SpecialChars)
 {
   cudf::test::fixed_width_column_wrapper<int> a{1, 6, 1, 6};
   cudf::test::strings_column_wrapper b{"abcd", "b\b\f\n\r\t", "\"c\"", "/\\"};
@@ -391,17 +413,15 @@ TEST_F(JsonWriterTest, SpecialChars)
                        .na_rep("null")
                        .build();
 
-  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   std::string const expected = R"({"\"a\"":1,"'b'":"abcd"}
 {"\"a\"":6,"'b'":"b\b\f\n\r\t"}
 {"\"a\"":1,"'b'":"\"c\""}
 {"\"a\"":6,"'b'":"\/\\"}
 )";
-  auto const output_string   = std::string(out_buffer.data(), out_buffer.size());
-  EXPECT_EQ(expected, output_string);
+  run_test(out_options, expected);
 }
 
-TEST_F(JsonWriterTest, NullList)
+TEST_P(JsonCompressedWriterTest, NullList)
 {
   std::string const data = R"(
 {"a": [null], "b": [[1, 2, 3], [null], [null, null, null], [4, null, 5]]}
@@ -417,23 +437,23 @@ TEST_F(JsonWriterTest, NullList)
   cudf::io::table_metadata mt{result.metadata};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(true)
-                           .metadata(mt)
-                           .lines(true)
-                           .na_rep("null");
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(true)
+                       .metadata(mt)
+                       .lines(true)
+                       .na_rep("null")
+                       .build();
+
   std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]}
 {"a":[2,null,null,3],"b":null}
 {"a":[null,null,4],"b":[[2,null],null]}
 {"a":[5,null,null],"b":[null,[3,4,5]]}
 )";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected);
 }
 
-TEST_F(JsonWriterTest, ChunkedNested)
+TEST_P(JsonCompressedWriterTest, ChunkedNested)
 {
   std::string const data = R"(
 {"a": 1, "b": -2, "c": {      }, "e": [{"f": 1}]}
@@ -455,15 +475,15 @@ TEST_F(JsonWriterTest, ChunkedNested)
   cudf::io::table_metadata mt{result.metadata};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(false)
-                           .metadata(mt)
-                           .lines(true)
-                           .na_rep("null")
-                           .rows_per_chunk(8);
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(false)
+                       .metadata(mt)
+                       .lines(true)
+                       .na_rep("null")
+                       .rows_per_chunk(8)
+                       .build();
+
   std::string const expected =
     R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]}
 {"a":2,"b":-2,"c":{}}
@@ -475,10 +495,10 @@ TEST_F(JsonWriterTest, ChunkedNested)
 {"a":8,"b":-2,"c":{"d":64},"e":[{"f":8}]}
 {"a":9,"b":-2,"c":{"d":81},"e":[{"f":9}]}
 )";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected);
 }
 
-TEST_F(JsonWriterTest, StructAllNullCombinations)
+TEST_P(JsonCompressedWriterTest, StructAllNullCombinations)
 {
   auto const_1_iter = thrust::make_constant_iterator(1);
 
@@ -512,14 +532,14 @@ TEST_F(JsonWriterTest, StructAllNullCombinations)
   cudf::io::table_metadata mt{{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}}};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(false)
-                           .metadata(mt)
-                           .lines(true)
-                           .na_rep("null");
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(false)
+                       .metadata(mt)
+                       .lines(true)
+                       .na_rep("null")
+                       .build();
+
   std::string const expected = R"({}
 {"e":1}
 {"d":1}
@@ -553,10 +573,10 @@ TEST_F(JsonWriterTest, StructAllNullCombinations)
 {"a":1,"b":1,"c":1,"d":1}
 {"a":1,"b":1,"c":1,"d":1,"e":1}
 )";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected);
 }
 
-TEST_F(JsonWriterTest, Unicode)
+TEST_P(JsonCompressedWriterTest, Unicode)
 {
   //                                       UTF-8,                      UTF-16
   cudf::test::strings_column_wrapper col1{"\"\\/\b\f\n\r\t", "ராபிட்ஸ்", "$€𐐷𤭢", "C𝞵𝓓𝒻"};
@@ -574,14 +594,13 @@ TEST_F(JsonWriterTest, Unicode)
   cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int16"}}};
 
   std::vector<char> out_buffer;
-  auto destination     = cudf::io::sink_info(&out_buffer);
-  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
-                           .include_nulls(true)
-                           .metadata(mt)
-                           .lines(true)
-                           .na_rep("null");
-
-  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+  auto destination = cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(true)
+                       .metadata(mt)
+                       .lines(true)
+                       .na_rep("null")
+                       .build();
 
   std::string const expected =
     R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null}
@@ -589,7 +608,7 @@ TEST_F(JsonWriterTest, Unicode)
 {"col1":"$\u20ac\ud801\udc37\ud852\udf62","col2":"\ud841\ude28\ud846\udd4c\ud849\uddc9\ud84c\uddca\ud850\udea9\ud854\udd7d\ud858\ude71\ud85f\udd31\ud860\udc72\ud864\udc79\ud869\udc22\ud86c\udded\ud872\udf2d\ud877\udeb7\ud878\udea6\u5c6e","int16":null}
 {"col1":"C\ud835\udfb5\ud835\udcd3\ud835\udcbb","col2":"\ud883\udf91\ud885\udd08\ud888\udf49","int16":4}
 )";
-  EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
+  run_test(out_options, expected);
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu
index a212d7d654a..0703fa72f67 100644
--- a/cpp/tests/large_strings/json_tests.cu
+++ b/cpp/tests/large_strings/json_tests.cu
@@ -15,6 +15,7 @@
  */
 
 #include "../io/json/json_utils.cuh"
+#include "io/comp/comp.hpp"
 #include "large_strings_fixture.hpp"
 
 #include <cudf_test/table_utilities.hpp>
@@ -25,10 +26,19 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {};
+struct JsonLargeReaderTest : public cudf::test::StringsLargeTest,
+                             public testing::WithParamInterface<cudf::io::compression_type> {};
 
-TEST_F(JsonLargeReaderTest, MultiBatch)
+// Parametrize qualifying JSON tests for multiple compression types
+INSTANTIATE_TEST_SUITE_P(JsonLargeReaderTest,
+                         JsonLargeReaderTest,
+                         ::testing::Values(cudf::io::compression_type::GZIP,
+                                           cudf::io::compression_type::NONE));
+
+TEST_P(JsonLargeReaderTest, MultiBatch)
 {
+  cudf::io::compression_type const comptype = GetParam();
+
   std::string json_string = R"(
     { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
     { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
@@ -48,11 +58,26 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
     json_string += json_string;
   }
 
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(json_string.data()),
+                                     json_string.size()),
+      cudf::get_default_stream());
+  } else
+    cdata = std::vector<uint8_t>(
+      reinterpret_cast<uint8_t const*>(json_string.data()),
+      reinterpret_cast<uint8_t const*>(json_string.data()) + json_string.size());
+
   constexpr int num_sources = 2;
   std::vector<cudf::host_span<std::byte>> hostbufs(
     num_sources,
     cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(json_string.data()),
                                json_string.size()));
+  std::vector<cudf::host_span<std::byte>> chostbufs(
+    num_sources,
+    cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(cdata.data()), cdata.size()));
 
   // Initialize parsing options (reading json lines)
   cudf::io::json_reader_options json_lines_options =
@@ -62,14 +87,20 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
       .lines(true)
       .compression(cudf::io::compression_type::NONE)
       .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+  cudf::io::json_reader_options cjson_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<std::byte>>(chostbufs.data(), chostbufs.size())})
+      .lines(true)
+      .compression(comptype)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
 
   // Read full test data via existing, nested JSON lines reader
-  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
+  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(cjson_lines_options);
+
+  auto datasources  = cudf::io::datasource::create(json_lines_options.get_source().host_buffers());
+  auto cdatasources = cudf::io::datasource::create(cjson_lines_options.get_source().host_buffers());
 
-  std::vector<std::unique_ptr<cudf::io::datasource>> datasources;
-  for (auto& hb : hostbufs) {
-    datasources.emplace_back(cudf::io::datasource::create(hb));
-  }
   // Test for different chunk sizes
   std::vector<std::size_t> chunk_sizes{batch_size_upper_bound / 4,
                                        batch_size_upper_bound / 2,
@@ -79,7 +110,9 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
   for (auto chunk_size : chunk_sizes) {
     auto const tables =
       split_byte_range_reading<std::int64_t>(datasources,
+                                             cdatasources,
                                              json_lines_options,
+                                             cjson_lines_options,
                                              chunk_size,
                                              cudf::get_default_stream(),
                                              cudf::get_current_device_resource_ref());
diff --git a/dependencies.yaml b/dependencies.yaml
index 23f0bf7ba85..631ce12f0b0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -748,7 +748,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.14
+          - polars>=1.11,<1.15
   run_cudf_polars_experimental:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky
index 152af22f7e4..9f3305278cb 100644
--- a/java/ci/Dockerfile.rocky
+++ b/java/ci/Dockerfile.rocky
@@ -33,7 +33,7 @@ RUN dnf --enablerepo=powertools install -y  scl-utils gcc-toolset-${TOOLSET_VERS
 RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids
 
 # 3.22.3+: CUDA architecture 'native' support + flexible CMAKE_<LANG>_*_LAUNCHER for ccache
-ARG CMAKE_VERSION=3.26.4
+ARG CMAKE_VERSION=3.28.6
 # default x86_64 from x86 build, aarch64 cmake for arm build
 ARG CMAKE_ARCH=x86_64
 RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 41a7db2285a..2958c286d20 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -16,24 +16,20 @@ set(cython_sources
     aggregation.pyx
     binaryop.pyx
     column.pyx
-    concat.pyx
     copying.pyx
     csv.pyx
     datetime.pyx
     filling.pyx
     groupby.pyx
-    hash.pyx
     interop.pyx
     join.pyx
     json.pyx
-    labeling.pyx
     lists.pyx
     merge.pyx
     null_mask.pyx
     orc.pyx
     parquet.pyx
     partitioning.pyx
-    quantiles.pyx
     reduce.pyx
     replace.pyx
     reshape.pyx
@@ -50,7 +46,6 @@ set(cython_sources
     transform.pyx
     transpose.pyx
     types.pyx
-    unary.pyx
     utils.pyx
 )
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 57df6899a22..19dc4488560 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -3,24 +3,20 @@
 
 from . import (
     binaryop,
-    concat,
     copying,
     csv,
     datetime,
     filling,
     groupby,
-    hash,
     interop,
     join,
     json,
-    labeling,
     merge,
     null_mask,
     nvtext,
     orc,
     parquet,
     partitioning,
-    quantiles,
     reduce,
     replace,
     reshape,
@@ -35,7 +31,6 @@
     text,
     timezone,
     transpose,
-    unary,
 )
 
 MAX_COLUMN_SIZE = np.iinfo(np.int32).max
diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index bb38488eefb..bdd90be45b8 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -2,8 +2,12 @@
 
 from __future__ import annotations
 
+from typing import Literal
+
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 from cudf._typing import Dtype, DtypeObj, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
@@ -71,3 +75,8 @@ class Column:
     # TODO: The val parameter should be Scalar, not ScalarLike
     @staticmethod
     def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
+    @staticmethod
+    def from_pylibcudf(
+        col: plc.Column, data_ptr_exposed: bool = False
+    ) -> ColumnBase: ...
+    def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
deleted file mode 100644
index e6c2d136f0d..00000000000
--- a/python/cudf/cudf/_lib/concat.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport data_from_pylibcudf_table
-
-import pylibcudf
-
-from cudf.core.buffer import acquire_spill_lock
-
-
-@acquire_spill_lock()
-def concat_columns(object columns):
-    return Column.from_pylibcudf(
-        pylibcudf.concatenate.concatenate(
-            [col.to_pylibcudf(mode="read") for col in columns]
-        )
-    )
-
-
-@acquire_spill_lock()
-def concat_tables(object tables, bool ignore_index=False):
-    plc_tables = []
-    for table in tables:
-        cols = table._columns
-        if not ignore_index:
-            cols = table._index._columns + cols
-        plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols]))
-
-    return data_from_pylibcudf_table(
-        pylibcudf.concatenate.concatenate(plc_tables),
-        column_names=tables[0]._column_names,
-        index_names=None if ignore_index else tables[0]._index_names
-    )
diff --git a/python/cudf/cudf/_lib/copying.pxd b/python/cudf/cudf/_lib/copying.pxd
deleted file mode 100644
index 14c7d2066d8..00000000000
--- a/python/cudf/cudf/_lib/copying.pxd
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from pylibcudf.libcudf.contiguous_split cimport packed_columns
-
-
-cdef class _CPackedColumns:
-    cdef packed_columns c_obj
-    cdef object column_names
-    cdef object column_dtypes
-    cdef object index_names
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 8b4d6199600..4dfb12d8ab3 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -2,37 +2,31 @@
 
 import pickle
 
-from libc.stdint cimport uint8_t, uintptr_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from rmm.pylibrmm.device_buffer cimport DeviceBuffer
-
 import pylibcudf
 
 import cudf
-from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer
-
+from cudf.core.buffer import acquire_spill_lock, as_buffer
+from cudf.core.abc import Serializable
 from cudf._lib.column cimport Column
 
 from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport table_view_from_table
 
 from cudf._lib.reduce import minmax
-from cudf.core.abc import Serializable
 
 from libcpp.memory cimport make_unique
 
-cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.types cimport size_type
 
-from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view
+from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table
+import pylibcudf as plc
+from pylibcudf.contiguous_split cimport PackedColumns as PlcPackedColumns
 
 
 def _gather_map_is_valid(
@@ -331,54 +325,37 @@ def get_element(Column input_column, size_type index):
     )
 
 
-cdef class _CPackedColumns:
-
-    @staticmethod
-    def from_py_table(input_table, keep_index=True):
-        """
-        Construct a ``PackedColumns`` object from a ``cudf.DataFrame``.
-        """
-        import cudf.core.dtypes
-
-        cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)
-
-        if keep_index and (
-            not isinstance(input_table.index, cudf.RangeIndex)
-            or input_table.index.start != 0
-            or input_table.index.stop != len(input_table)
-            or input_table.index.step != 1
-        ):
-            input_table_view = table_view_from_table(input_table)
-            p.index_names = input_table._index_names
-        else:
-            input_table_view = table_view_from_table(
-                input_table, ignore_index=True)
-
-        p.column_names = input_table._column_names
-        p.column_dtypes = {}
-        for name, col in input_table._column_labels_and_values:
-            if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
-                p.column_dtypes[name] = col.dtype
-
-        p.c_obj = move(cpp_contiguous_split.pack(input_table_view))
+class PackedColumns(Serializable):
+    """
+    A packed representation of a Frame, with all columns residing
+    in a single GPU memory buffer.
+    """
 
-        return p
+    def __init__(
+        self,
+        PlcPackedColumns data,
+        object column_names = None,
+        object index_names = None,
+        object column_dtypes = None
+    ):
+        self._metadata, self._gpu_data = data.release()
+        self.column_names=column_names
+        self.index_names=index_names
+        self.column_dtypes=column_dtypes
 
-    @property
-    def gpu_data_ptr(self):
-        return int(<uintptr_t>self.c_obj.gpu_data.get()[0].data())
+    def __reduce__(self):
+        return self.deserialize, self.serialize()
 
     @property
-    def gpu_data_size(self):
-        return int(<size_t>self.c_obj.gpu_data.get()[0].size())
+    def __cuda_array_interface__(self):
+        return self._gpu_data.__cuda_array_interface__
 
     def serialize(self):
         header = {}
         frames = []
-
         gpu_data = as_buffer(
-            data=self.gpu_data_ptr,
-            size=self.gpu_data_size,
+            data = self._gpu_data.obj.ptr,
+            size = self._gpu_data.obj.size,
             owner=self,
             exposed=True
         )
@@ -388,65 +365,83 @@ cdef class _CPackedColumns:
 
         header["column-names"] = self.column_names
         header["index-names"] = self.index_names
-        if self.c_obj.metadata.get()[0].data() != NULL:
-            header["metadata"] = list(
-                <uint8_t[:self.c_obj.metadata.get()[0].size()]>
-                self.c_obj.metadata.get()[0].data()
-            )
-
-        column_dtypes = {}
+        header["metadata"] = self._metadata.tobytes()
         for name, dtype in self.column_dtypes.items():
             dtype_header, dtype_frames = dtype.serialize()
-            column_dtypes[name] = (
+            self.column_dtypes[name] = (
                 dtype_header,
                 (len(frames), len(frames) + len(dtype_frames)),
             )
             frames.extend(dtype_frames)
-        header["column-dtypes"] = column_dtypes
-
+        header["column-dtypes"] = self.column_dtypes
+        header["type-serialized"] = pickle.dumps(type(self))
         return header, frames
 
-    @staticmethod
-    def deserialize(header, frames):
-        cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)
-
-        gpu_data = Buffer.deserialize(header["data"], frames)
-
-        dbuf = DeviceBuffer(
-            ptr=gpu_data.get_ptr(mode="write"),
-            size=gpu_data.nbytes
-        )
-
-        cdef cpp_contiguous_split.packed_columns data
-        data.metadata = move(
-            make_unique[vector[uint8_t]](
-                move(<vector[uint8_t]>header.get("metadata", []))
-            )
-        )
-        data.gpu_data = move(dbuf.c_obj)
-
-        p.c_obj = move(data)
-        p.column_names = header["column-names"]
-        p.index_names = header["index-names"]
-
+    @classmethod
+    def deserialize(cls, header, frames):
         column_dtypes = {}
         for name, dtype in header["column-dtypes"].items():
             dtype_header, (start, stop) = dtype
             column_dtypes[name] = pickle.loads(
                 dtype_header["type-serialized"]
             ).deserialize(dtype_header, frames[start:stop])
-        p.column_dtypes = column_dtypes
+        return cls(
+            plc.contiguous_split.pack(
+                plc.contiguous_split.unpack_from_memoryviews(
+                    memoryview(header["metadata"]),
+                    plc.gpumemoryview(frames[0]),
+                )
+            ),
+            header["column-names"],
+            header["index-names"],
+            column_dtypes,
+        )
 
-        return p
+    @classmethod
+    def from_py_table(cls, input_table, keep_index=True):
+        if keep_index and (
+            not isinstance(input_table.index, cudf.RangeIndex)
+            or input_table.index.start != 0
+            or input_table.index.stop != len(input_table)
+            or input_table.index.step != 1
+        ):
+            columns = input_table._index._columns + input_table._columns
+            index_names = input_table._index_names
+        else:
+            columns = input_table._columns
+            index_names = None
+
+        column_names = input_table._column_names
+        column_dtypes = {}
+        for name, col in input_table._column_labels_and_values:
+            if isinstance(
+                col.dtype,
+                (cudf.core.dtypes._BaseDtype, cudf.core.dtypes.CategoricalDtype)
+            ):
+                column_dtypes[name] = col.dtype
+
+        return cls(
+            plc.contiguous_split.pack(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read") for col in columns
+                    ]
+                )
+            ),
+            column_names,
+            index_names,
+            column_dtypes,
+        )
 
     def unpack(self):
-        output_table = cudf.DataFrame._from_data(*data_from_table_view(
-            cpp_contiguous_split.unpack(self.c_obj),
-            self,
+        output_table = cudf.DataFrame._from_data(*data_from_pylibcudf_table(
+            plc.contiguous_split.unpack_from_memoryviews(
+                self._metadata,
+                self._gpu_data
+            ),
             self.column_names,
             self.index_names
         ))
-
         for name, dtype in self.column_dtypes.items():
             output_table._data[name] = (
                 output_table._data[name]._with_type_metadata(dtype)
@@ -455,46 +450,6 @@ cdef class _CPackedColumns:
         return output_table
 
 
-class PackedColumns(Serializable):
-    """
-    A packed representation of a Frame, with all columns residing
-    in a single GPU memory buffer.
-    """
-
-    def __init__(self, data):
-        self._data = data
-
-    def __reduce__(self):
-        return self.deserialize, self.serialize()
-
-    @property
-    def __cuda_array_interface__(self):
-        return {
-            "data": (self._data.gpu_data_ptr, False),
-            "shape": (self._data.gpu_data_size,),
-            "strides": None,
-            "typestr": "|u1",
-            "version": 0
-        }
-
-    def serialize(self):
-        header, frames = self._data.serialize()
-        header["type-serialized"] = pickle.dumps(type(self))
-
-        return header, frames
-
-    @classmethod
-    def deserialize(cls, header, frames):
-        return cls(_CPackedColumns.deserialize(header, frames))
-
-    @classmethod
-    def from_py_table(cls, input_table, keep_index=True):
-        return cls(_CPackedColumns.from_py_table(input_table, keep_index))
-
-    def unpack(self):
-        return self._data.unpack()
-
-
 def pack(input_table, keep_index=True):
     """
     Pack the columns of a cudf Frame into a single GPU memory buffer.
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
deleted file mode 100644
index 89309b36371..00000000000
--- a/python/cudf/cudf/_lib/hash.pyx
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.table cimport Table
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def hash_partition(list source_columns, list columns_to_hash,
-                   int num_partitions):
-    plc_table, offsets = plc.partitioning.hash_partition(
-        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
-        columns_to_hash,
-        num_partitions
-    )
-    return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets
-
-
-@acquire_spill_lock()
-def hash(list source_columns, str method, int seed=0):
-    cdef Table ctbl = Table(
-        [c.to_pylibcudf(mode="read") for c in source_columns]
-    )
-    if method == "murmur3":
-        return Column.from_pylibcudf(plc.hashing.murmurhash3_x86_32(ctbl, seed))
-    elif method == "xxhash64":
-        return Column.from_pylibcudf(plc.hashing.xxhash_64(ctbl, seed))
-    elif method == "md5":
-        return Column.from_pylibcudf(plc.hashing.md5(ctbl))
-    elif method == "sha1":
-        return Column.from_pylibcudf(plc.hashing.sha1(ctbl))
-    elif method == "sha224":
-        return Column.from_pylibcudf(plc.hashing.sha224(ctbl))
-    elif method == "sha256":
-        return Column.from_pylibcudf(plc.hashing.sha256(ctbl))
-    elif method == "sha384":
-        return Column.from_pylibcudf(plc.hashing.sha384(ctbl))
-    elif method == "sha512":
-        return Column.from_pylibcudf(plc.hashing.sha512(ctbl))
-    else:
-        raise ValueError(
-            f"Unsupported hashing algorithm {method}."
-        )
diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
deleted file mode 100644
index 524bfd3b2e8..00000000000
--- a/python/cudf/cudf/_lib/labeling.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool as cbool
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-from cudf.core.buffer import acquire_spill_lock
-
-
-# Note that the parameter input shadows a Python built-in in the local scope,
-# but I'm not too concerned about that since there's no use-case for actual
-# input in this context.
-@acquire_spill_lock()
-def label_bins(Column input, Column left_edges, cbool left_inclusive,
-               Column right_edges, cbool right_inclusive):
-    plc_column = plc.labeling.label_bins(
-        input.to_pylibcudf(mode="read"),
-        left_edges.to_pylibcudf(mode="read"),
-        plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO,
-        right_edges.to_pylibcudf(mode="read"),
-        plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO,
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 1589e23f716..a163bb07888 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -3,12 +3,10 @@
 from numba.np import numpy_support
 
 import cudf
-from cudf.core._internals.expressions import parse_expression
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.utils import cudautils
 
 from pylibcudf cimport transform as plc_transform
-from pylibcudf.expressions cimport Expression
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
@@ -93,7 +91,7 @@ def one_hot_encode(Column input_column, Column categories):
 
 
 @acquire_spill_lock()
-def compute_column(list columns, tuple column_names, expr: str):
+def compute_column(list columns, tuple column_names, str expr):
     """Compute a new column by evaluating an expression on a set of columns.
 
     Parameters
@@ -108,12 +106,8 @@ def compute_column(list columns, tuple column_names, expr: str):
     expr : str
         The expression to evaluate.
     """
-    visitor = parse_expression(expr, column_names)
-
-    # At the end, all the stack contains is the expression to evaluate.
-    cdef Expression cudf_expr = visitor.expression
     result = plc_transform.compute_column(
         plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
-        cudf_expr,
+        plc.expressions.to_expression(expr, column_names),
     )
     return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx
deleted file mode 100644
index d5602fd5a1c..00000000000
--- a/python/cudf/cudf/_lib/unary.pyx
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf._lib.column cimport Column
-from cudf._lib.types cimport dtype_to_pylibcudf_type
-
-import numpy as np
-
-import pylibcudf
-
-from cudf.api.types import is_decimal_dtype
-from cudf.core.buffer import acquire_spill_lock
-
-
-@acquire_spill_lock()
-def unary_operation(Column input, object op):
-    return Column.from_pylibcudf(
-        pylibcudf.unary.unary_operation(input.to_pylibcudf(mode="read"), op)
-    )
-
-
-@acquire_spill_lock()
-def is_null(Column input):
-    return Column.from_pylibcudf(
-        pylibcudf.unary.is_null(input.to_pylibcudf(mode="read"))
-    )
-
-
-@acquire_spill_lock()
-def is_valid(Column input):
-    return Column.from_pylibcudf(
-        pylibcudf.unary.is_valid(input.to_pylibcudf(mode="read"))
-    )
-
-
-@acquire_spill_lock()
-def cast(Column input, object dtype=np.float64):
-    result = Column.from_pylibcudf(
-        pylibcudf.unary.cast(
-            input.to_pylibcudf(mode="read"),
-            dtype_to_pylibcudf_type(dtype)
-        )
-    )
-
-    if is_decimal_dtype(result.dtype):
-        result.dtype.precision = dtype.precision
-    return result
-
-
-@acquire_spill_lock()
-def is_nan(Column input):
-    return Column.from_pylibcudf(
-        pylibcudf.unary.is_nan(input.to_pylibcudf(mode="read"))
-    )
-
-
-@acquire_spill_lock()
-def is_non_nan(Column input):
-    return Column.from_pylibcudf(
-        pylibcudf.unary.is_not_nan(input.to_pylibcudf(mode="read"))
-    )
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 623c5064a1a..6db3036d514 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -10,7 +10,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view
 
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
-cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
+cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
@@ -18,5 +18,5 @@ cdef table_view table_view_from_columns(columns) except *
 cdef table_view table_view_from_table(tbl, ignore_index=*) except*
 cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
 cdef columns_from_table_view(table_view tv, object owners)
-cdef columns_from_pylibcudf_table(tbl)
+cpdef columns_from_pylibcudf_table(tbl)
 cdef _data_from_columns(columns, column_names, index_names=*)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 292de82e4c4..244d7fdc006 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -229,7 +229,7 @@ cdef columns_from_unique_ptr(
     return columns
 
 
-cdef columns_from_pylibcudf_table(tbl):
+cpdef columns_from_pylibcudf_table(tbl):
     """Convert a pylibcudf table into list of columns.
 
     Parameters
@@ -309,7 +309,7 @@ cdef data_from_unique_ptr(
     )
 
 
-cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
+cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
     return _data_from_columns(
         columns_from_pylibcudf_table(tbl),
         column_names,
diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
deleted file mode 100644
index 90d9118027a..00000000000
--- a/python/cudf/cudf/core/_internals/expressions.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-from __future__ import annotations
-
-import ast
-import functools
-
-import pyarrow as pa
-
-import pylibcudf as plc
-from pylibcudf.expressions import (
-    ASTOperator,
-    ColumnReference,
-    Expression,
-    Literal,
-    Operation,
-)
-
-# This dictionary encodes the mapping from Python AST operators to their cudf
-# counterparts.
-python_cudf_operator_map = {
-    # Binary operators
-    ast.Add: ASTOperator.ADD,
-    ast.Sub: ASTOperator.SUB,
-    ast.Mult: ASTOperator.MUL,
-    ast.Div: ASTOperator.DIV,
-    ast.FloorDiv: ASTOperator.FLOOR_DIV,
-    ast.Mod: ASTOperator.PYMOD,
-    ast.Pow: ASTOperator.POW,
-    ast.Eq: ASTOperator.EQUAL,
-    ast.NotEq: ASTOperator.NOT_EQUAL,
-    ast.Lt: ASTOperator.LESS,
-    ast.Gt: ASTOperator.GREATER,
-    ast.LtE: ASTOperator.LESS_EQUAL,
-    ast.GtE: ASTOperator.GREATER_EQUAL,
-    ast.BitXor: ASTOperator.BITWISE_XOR,
-    # TODO: The mapping of logical/bitwise operators here is inconsistent with
-    # pandas. In pandas, Both `BitAnd` and `And` map to
-    # `ASTOperator.LOGICAL_AND` for booleans, while they map to
-    # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to
-    # encode this at present because expressions can be arbitrarily nested so
-    # we won't know the dtype of the input without inserting a much more
-    # complex traversal of the expression tree to determine the output types at
-    # each node. For now, we'll rely on users to use the appropriate operator.
-    ast.BitAnd: ASTOperator.BITWISE_AND,
-    ast.BitOr: ASTOperator.BITWISE_OR,
-    ast.And: ASTOperator.LOGICAL_AND,
-    ast.Or: ASTOperator.LOGICAL_OR,
-    # Unary operators
-    ast.Invert: ASTOperator.BIT_INVERT,
-    ast.Not: ASTOperator.NOT,
-    # TODO: Missing USub, possibility other unary ops?
-}
-
-
-# Mapping between Python function names encode in an ast.Call node and the
-# corresponding libcudf C++ AST operators.
-python_cudf_function_map = {
-    # TODO: Operators listed on
-    # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval  # noqa: E501
-    # that we don't support yet:
-    # expm1, log1p, arctan2 and log10.
-    "isnull": ASTOperator.IS_NULL,
-    "isna": ASTOperator.IS_NULL,
-    "sin": ASTOperator.SIN,
-    "cos": ASTOperator.COS,
-    "tan": ASTOperator.TAN,
-    "arcsin": ASTOperator.ARCSIN,
-    "arccos": ASTOperator.ARCCOS,
-    "arctan": ASTOperator.ARCTAN,
-    "sinh": ASTOperator.SINH,
-    "cosh": ASTOperator.COSH,
-    "tanh": ASTOperator.TANH,
-    "arcsinh": ASTOperator.ARCSINH,
-    "arccosh": ASTOperator.ARCCOSH,
-    "arctanh": ASTOperator.ARCTANH,
-    "exp": ASTOperator.EXP,
-    "log": ASTOperator.LOG,
-    "sqrt": ASTOperator.SQRT,
-    "abs": ASTOperator.ABS,
-    "ceil": ASTOperator.CEIL,
-    "floor": ASTOperator.FLOOR,
-    # TODO: Operators supported by libcudf with no Python function analog.
-    # ast.rint: ASTOperator.RINT,
-    # ast.cbrt: ASTOperator.CBRT,
-}
-
-
-class libcudfASTVisitor(ast.NodeVisitor):
-    """A NodeVisitor specialized for constructing a libcudf expression tree.
-
-    This visitor is designed to handle AST nodes that have libcudf equivalents.
-    It constructs column references from names and literals from constants,
-    then builds up operations. The final result can be accessed using the
-    `expression` property. The visitor must be kept in scope for as long as the
-    expression is needed because all of the underlying libcudf expressions will
-    be destroyed when the libcudfASTVisitor is.
-
-    Parameters
-    ----------
-    col_names : Tuple[str]
-        The column names used to map the names in an expression.
-    """
-
-    def __init__(self, col_names: tuple[str]):
-        self.stack: list[Expression] = []
-        self.nodes: list[Expression] = []
-        self.col_names = col_names
-
-    @property
-    def expression(self):
-        """Expression: The result of parsing an AST."""
-        assert len(self.stack) == 1
-        return self.stack[-1]
-
-    def visit_Name(self, node):
-        try:
-            col_id = self.col_names.index(node.id)
-        except ValueError:
-            raise ValueError(f"Unknown column name {node.id}")
-        self.stack.append(ColumnReference(col_id))
-
-    def visit_Constant(self, node):
-        if not isinstance(node.value, (float, int, str, complex)):
-            raise ValueError(
-                f"Unsupported literal {repr(node.value)} of type "
-                "{type(node.value).__name__}"
-            )
-        self.stack.append(
-            Literal(plc.interop.from_arrow(pa.scalar(node.value)))
-        )
-
-    def visit_UnaryOp(self, node):
-        self.visit(node.operand)
-        self.nodes.append(self.stack.pop())
-        if isinstance(node.op, ast.USub):
-            # TODO: Except for leaf nodes, we won't know the type of the
-            # operand, so there's no way to know whether this should be a float
-            # or an int. We should maybe see what Spark does, and this will
-            # probably require casting.
-            self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1))))
-            op = ASTOperator.MUL
-            self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2]))
-        elif isinstance(node.op, ast.UAdd):
-            self.stack.append(self.nodes[-1])
-        else:
-            op = python_cudf_operator_map[type(node.op)]
-            self.stack.append(Operation(op, self.nodes[-1]))
-
-    def visit_BinOp(self, node):
-        self.visit(node.left)
-        self.visit(node.right)
-        self.nodes.append(self.stack.pop())
-        self.nodes.append(self.stack.pop())
-
-        op = python_cudf_operator_map[type(node.op)]
-        self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2]))
-
-    def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops):
-        # Helper function handling the common components of parsing BoolOp and
-        # Compare AST nodes. These two types of nodes both support chaining
-        # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this
-        # function helps standardize that.
-
-        # TODO: Whether And/Or and BitAnd/BitOr actually correspond to
-        # logical or bitwise operators depends on the data types that they
-        # are applied to. We'll need to add logic to map to that.
-        inner_ops = []
-        for op, (left, right) in zip(operators, operands):
-            # Note that this will lead to duplicate nodes, e.g. if
-            # the comparison is `a < b < c` that will be encoded as
-            # `a < b and b < c`. We could potentially optimize by caching
-            # expressions by name so that we only construct them once.
-            self.visit(left)
-            self.visit(right)
-
-            self.nodes.append(self.stack.pop())
-            self.nodes.append(self.stack.pop())
-
-            op = python_cudf_operator_map[type(op)]
-            inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2]))
-
-        self.nodes.extend(inner_ops)
-
-        # If we have more than one comparator, we need to link them
-        # together with LOGICAL_AND operators.
-        if has_multiple_ops:
-            op = ASTOperator.LOGICAL_AND
-
-            def _combine_compare_ops(left, right):
-                self.nodes.append(Operation(op, left, right))
-                return self.nodes[-1]
-
-            functools.reduce(_combine_compare_ops, inner_ops)
-
-        self.stack.append(self.nodes[-1])
-
-    def visit_BoolOp(self, node):
-        operators = [node.op] * (len(node.values) - 1)
-        operands = zip(node.values[:-1], node.values[1:])
-        self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2)
-
-    def visit_Compare(self, node):
-        operands = (node.left, *node.comparators)
-        has_multiple_ops = len(operands) > 2
-        operands = zip(operands[:-1], operands[1:])
-        self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops)
-
-    def visit_Call(self, node):
-        try:
-            op = python_cudf_function_map[node.func.id]
-        except KeyError:
-            raise ValueError(f"Unsupported function {node.func}.")
-        # Assuming only unary functions are supported, which is checked above.
-        if len(node.args) != 1 or node.keywords:
-            raise ValueError(
-                f"Function {node.func} only accepts one positional "
-                "argument."
-            )
-        self.visit(node.args[0])
-
-        self.nodes.append(self.stack.pop())
-        self.stack.append(Operation(op, self.nodes[-1]))
-
-
-@functools.lru_cache(256)
-def parse_expression(expr: str, col_names: tuple[str]):
-    visitor = libcudfASTVisitor(col_names)
-    visitor.visit(ast.parse(expr))
-    return visitor
diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py
new file mode 100644
index 00000000000..3b8e3db60a7
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/unary.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+from cudf._lib.types import dtype_to_pylibcudf_type
+from cudf.api.types import is_decimal_dtype
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf._typing import Dtype
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def unary_operation(
+    col: ColumnBase, op: plc.unary.UnaryOperator
+) -> ColumnBase:
+    return type(col).from_pylibcudf(
+        plc.unary.unary_operation(col.to_pylibcudf(mode="read"), op)
+    )
+
+
+@acquire_spill_lock()
+def is_null(col: ColumnBase) -> ColumnBase:
+    return type(col).from_pylibcudf(
+        plc.unary.is_null(col.to_pylibcudf(mode="read"))
+    )
+
+
+@acquire_spill_lock()
+def is_valid(col: ColumnBase) -> ColumnBase:
+    return type(col).from_pylibcudf(
+        plc.unary.is_valid(col.to_pylibcudf(mode="read"))
+    )
+
+
+@acquire_spill_lock()
+def cast(col: ColumnBase, dtype: Dtype) -> ColumnBase:
+    result = type(col).from_pylibcudf(
+        plc.unary.cast(
+            col.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
+        )
+    )
+
+    if is_decimal_dtype(result.dtype):
+        result.dtype.precision = dtype.precision  # type: ignore[union-attr]
+    return result
+
+
+@acquire_spill_lock()
+def is_nan(col: ColumnBase) -> ColumnBase:
+    return type(col).from_pylibcudf(
+        plc.unary.is_nan(col.to_pylibcudf(mode="read"))
+    )
+
+
+@acquire_spill_lock()
+def is_non_nan(col: ColumnBase) -> ColumnBase:
+    return type(col).from_pylibcudf(
+        plc.unary.is_not_nan(col.to_pylibcudf(mode="read"))
+    )
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 087d0ed65f5..7354b917f90 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -14,6 +14,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.transform import bools_to_mask
+from cudf.core._internals import unary
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
@@ -1018,12 +1019,12 @@ def isnull(self) -> ColumnBase:
         """
         Identify missing values in a CategoricalColumn.
         """
-        result = libcudf.unary.is_null(self)
+        result = unary.is_null(self)
 
         if self.categories.dtype.kind == "f":
             # Need to consider `np.nan` values in case
             # of an underlying float column
-            categories = libcudf.unary.is_nan(self.categories)
+            categories = unary.is_nan(self.categories)
             if categories.any():
                 code = self._encode(np.nan)
                 result = result | (self.codes == cudf.Scalar(code))
@@ -1034,12 +1035,12 @@ def notnull(self) -> ColumnBase:
         """
         Identify non-missing values in a CategoricalColumn.
         """
-        result = libcudf.unary.is_valid(self)
+        result = unary.is_valid(self)
 
         if self.categories.dtype.kind == "f":
             # Need to consider `np.nan` values in case
             # of an underlying float column
-            categories = libcudf.unary.is_nan(self.categories)
+            categories = unary.is_nan(self.categories)
             if categories.any():
                 code = self._encode(np.nan)
                 result = result & (self.codes != cudf.Scalar(code))
@@ -1203,9 +1204,7 @@ def _concat(
         elif newsize == 0:
             codes_col = column.column_empty(0, head.codes.dtype, masked=True)
         else:
-            # Filter out inputs that have 0 length, then concatenate.
-            codes = [o for o in codes if len(o)]
-            codes_col = libcudf.concat.concat_columns(objs)
+            codes_col = column.concat_columns(codes)  # type: ignore[arg-type]
 
         codes_col = as_unsigned_codes(
             len(cats),
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index d2f9d208c77..f6eaea4b783 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -19,6 +19,7 @@
 from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
 from typing_extensions import Self
 
+import pylibcudf as plc
 import rmm
 
 import cudf
@@ -47,6 +48,7 @@
     is_string_dtype,
 )
 from cudf.core._compat import PANDAS_GE_210
+from cudf.core._internals import unary
 from cudf.core._internals.timezones import get_compatible_timezone
 from cudf.core.abc import Serializable
 from cudf.core.buffer import (
@@ -713,12 +715,12 @@ def isnull(self) -> ColumnBase:
         if not self.has_nulls(include_nan=self.dtype.kind == "f"):
             return as_column(False, length=len(self))
 
-        result = libcudf.unary.is_null(self)
+        result = unary.is_null(self)
 
         if self.dtype.kind == "f":
             # Need to consider `np.nan` values in case
             # of a float column
-            result = result | libcudf.unary.is_nan(self)
+            result = result | unary.is_nan(self)
 
         return result
 
@@ -727,12 +729,12 @@ def notnull(self) -> ColumnBase:
         if not self.has_nulls(include_nan=self.dtype.kind == "f"):
             return as_column(True, length=len(self))
 
-        result = libcudf.unary.is_valid(self)
+        result = unary.is_valid(self)
 
         if self.dtype.kind == "f":
             # Need to consider `np.nan` values in case
             # of a float column
-            result = result & libcudf.unary.is_non_nan(self)
+            result = result & unary.is_non_nan(self)
 
         return result
 
@@ -2299,4 +2301,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         return column_empty(0, head.dtype, masked=True)
 
     # Filter out inputs that have 0 length, then concatenate.
-    return libcudf.concat.concat_columns([o for o in objs if len(o)])
+    objs_with_len = [o for o in objs if len(o)]
+    with acquire_spill_lock():
+        return Column.from_pylibcudf(
+            plc.concatenate.concatenate(
+                [col.to_pylibcudf(mode="read") for col in objs_with_len]
+            )
+        )
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index bd0d72b9bc0..16124cf0a7d 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -14,17 +14,19 @@
 import pandas as pd
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
 from cudf.core._compat import PANDAS_GE_220
+from cudf.core._internals import unary
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
     get_compatible_timezone,
     get_tz_data,
 )
-from cudf.core.buffer import Buffer
+from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
@@ -490,7 +492,7 @@ def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
                 "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. "
                 "Use tz_localize instead."
             )
-        return libcudf.unary.cast(self, dtype=dtype)
+        return unary.cast(self, dtype=dtype)  # type: ignore[return-value]
 
     def as_timedelta_column(self, dtype: Dtype) -> None:  # type: ignore[override]
         raise TypeError(
@@ -818,13 +820,16 @@ def _find_ambiguous_and_nonexistent(
         # The end of an ambiguous time period is what Clock 2 reads at
         # the moment of transition:
         ambiguous_end = clock_2.apply_boolean_mask(cond)
-        ambiguous = label_bins(
-            self,
-            left_edges=ambiguous_begin,
-            left_inclusive=True,
-            right_edges=ambiguous_end,
-            right_inclusive=False,
-        ).notnull()
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                self.to_pylibcudf(mode="read"),
+                ambiguous_begin.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES,
+                ambiguous_end.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.NO,
+            )
+            ambiguous = libcudf.column.Column.from_pylibcudf(plc_column)
+        ambiguous = ambiguous.notnull()
 
         # At the start of a non-existent time period, Clock 2 reads less
         # than Clock 1 (which has been turned forward):
@@ -834,13 +839,16 @@ def _find_ambiguous_and_nonexistent(
         # The end of the non-existent time period is what Clock 1 reads
         # at the moment of transition:
         nonexistent_end = clock_1.apply_boolean_mask(cond)
-        nonexistent = label_bins(
-            self,
-            left_edges=nonexistent_begin,
-            left_inclusive=True,
-            right_edges=nonexistent_end,
-            right_inclusive=False,
-        ).notnull()
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                self.to_pylibcudf(mode="read"),
+                nonexistent_begin.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES,
+                nonexistent_end.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.NO,
+            )
+            nonexistent = libcudf.column.Column.from_pylibcudf(plc_column)
+        nonexistent = nonexistent.notnull()
 
         return ambiguous, nonexistent
 
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 8ae06f72d1e..ce7aa91f775 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import warnings
-from collections.abc import Sequence
 from decimal import Decimal
 from typing import TYPE_CHECKING, cast
 
@@ -17,6 +16,7 @@
     from_decimal as cpp_from_decimal,
 )
 from cudf.api.types import is_scalar
+from cudf.core._internals import unary
 from cudf.core.buffer import as_buffer
 from cudf.core.column import ColumnBase
 from cudf.core.dtypes import (
@@ -85,7 +85,7 @@ def as_decimal_column(
 
         if dtype == self.dtype:
             return self
-        return libcudf.unary.cast(self, dtype)
+        return unary.cast(self, dtype)  # type: ignore[return-value]
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         if len(self) > 0:
@@ -216,23 +216,10 @@ def normalize_binop_value(self, other):
             )
         return NotImplemented
 
-    def _decimal_quantile(
-        self, q: float | Sequence[float], interpolation: str, exact: bool
-    ) -> ColumnBase:
-        quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q
-        # get sorted indices and exclude nulls
-        indices = libcudf.sort.order_by(
-            [self], [True], "first", stable=True
-        ).slice(self.null_count, len(self))
-        result = libcudf.quantiles.quantile(
-            self, quant, interpolation, indices, exact
-        )
-        return result._with_type_metadata(self.dtype)
-
     def as_numerical_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.NumericalColumn":
-        return libcudf.unary.cast(self, dtype)
+        return unary.cast(self, dtype)  # type: ignore[return-value]
 
 
 class Decimal32Column(DecimalBaseColumn):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f79496ed0ec..36d1bdb45b6 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -14,6 +14,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf.api.types import is_integer, is_scalar
+from cudf.core._internals import unary
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
@@ -125,7 +126,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn:
             and self.dtype.kind in {"c", "f"}
             and np.isnan(value)
         ):
-            nan_col = libcudf.unary.is_nan(self)
+            nan_col = unary.is_nan(self)
             return nan_col.indices_of(True)
         else:
             return super().indices_of(value)
@@ -184,7 +185,7 @@ def unary_operator(self, unaryop: str | Callable) -> ColumnBase:
         unaryop = unaryop.upper()
         unaryop = _unaryop_map.get(unaryop, unaryop)
         unaryop = pylibcudf.unary.UnaryOperator[unaryop]
-        return libcudf.unary.unary_operation(self, unaryop)
+        return unary.unary_operation(self, unaryop)
 
     def __invert__(self):
         if self.dtype.kind in "ui":
@@ -388,13 +389,13 @@ def as_timedelta_column(
     def as_decimal_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.DecimalBaseColumn":
-        return libcudf.unary.cast(self, dtype)
+        return unary.cast(self, dtype)  # type: ignore[return-value]
 
     def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
         dtype = cudf.dtype(dtype)
         if dtype == self.dtype:
             return self
-        return libcudf.unary.cast(self, dtype)
+        return unary.cast(self, dtype)  # type: ignore[return-value]
 
     def all(self, skipna: bool = True) -> bool:
         # If all entries are null the result is True, including when the column
@@ -421,7 +422,7 @@ def any(self, skipna: bool = True) -> bool:
     def nan_count(self) -> int:
         if self.dtype.kind != "f":
             return 0
-        nan_col = libcudf.unary.is_nan(self)
+        nan_col = unary.is_nan(self)
         return nan_col.sum()
 
     def _process_values_for_isin(
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index f6ab91f2f01..6d639337401 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -7,9 +7,11 @@
 
 import numpy as np
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
-from cudf.core.buffer import Buffer
+from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase
 from cudf.core.missing import NA
 from cudf.core.mixins import Scannable
@@ -145,9 +147,15 @@ def quantile(
             indices = libcudf.sort.order_by(
                 [self], [True], "first", stable=True
             ).slice(self.null_count, len(self))
-            result = libcudf.quantiles.quantile(
-                self, q, interpolation, indices, exact
-            )
+            with acquire_spill_lock():
+                plc_column = plc.quantiles.quantile(
+                    self.to_pylibcudf(mode="read"),
+                    q,
+                    plc.types.Interpolation[interpolation.upper()],
+                    indices.to_pylibcudf(mode="read"),
+                    exact,
+                )
+                result = type(self).from_pylibcudf(plc_column)  # type: ignore[assignment]
         if return_scalar:
             scalar_result = result.element_indexing(0)
             if interpolation in {"lower", "higher", "nearest"}:
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index c3ad09cf898..620fe31c30f 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -13,6 +13,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf.api.types import is_scalar
+from cudf.core._internals import unary
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, column, string
 from cudf.utils.dtypes import np_to_pa_dtype
@@ -307,7 +308,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
     def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
         if dtype == self.dtype:
             return self
-        return libcudf.unary.cast(self, dtype=dtype)
+        return unary.cast(self, dtype=dtype)  # type: ignore[return-value]
 
     def find_and_replace(
         self,
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index c9b1fa2669c..a4d12cfc7f0 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -6,8 +6,12 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
+from cudf._lib.column import Column
 from cudf.api.types import is_list_like
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import as_column
 from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.core.index import IntervalIndex, interval_range
@@ -256,9 +260,19 @@ def cut(
         # the input arr must be changed to the same type as the edges
         input_arr = input_arr.astype(left_edges.dtype)
     # get the indexes for the appropriate number
-    index_labels = cudf._lib.labeling.label_bins(
-        input_arr, left_edges, left_inclusive, right_edges, right_inclusive
-    )
+    with acquire_spill_lock():
+        plc_column = plc.labeling.label_bins(
+            input_arr.to_pylibcudf(mode="read"),
+            left_edges.to_pylibcudf(mode="read"),
+            plc.labeling.Inclusive.YES
+            if left_inclusive
+            else plc.labeling.Inclusive.NO,
+            right_edges.to_pylibcudf(mode="read"),
+            plc.labeling.Inclusive.YES
+            if right_inclusive
+            else plc.labeling.Inclusive.NO,
+        )
+        index_labels = Column.from_pylibcudf(plc_column)
 
     if labels is False:
         # if labels is false we return the index labels, we return them
@@ -283,7 +297,7 @@ def cut(
             # should allow duplicate categories.
             return interval_labels[index_labels]
 
-    index_labels = as_unsigned_codes(len(interval_labels), index_labels)
+    index_labels = as_unsigned_codes(len(interval_labels), index_labels)  # type: ignore[arg-type]
 
     col = CategoricalColumn(
         data=None,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bf1c39b23da..bd78d5dd9f1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -26,6 +26,8 @@
 from pandas.io.formats.printing import pprint_thing
 from typing_extensions import Self, assert_never
 
+import pylibcudf as plc
+
 import cudf
 import cudf.core.common
 from cudf import _lib as libcudf
@@ -43,6 +45,7 @@
 from cudf.core import column, df_protocol, indexing_utils, reshape
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,
@@ -1784,11 +1787,32 @@ def _concat(
             )
 
         # Concatenate the Tables
-        out = cls._from_data(
-            *libcudf.concat.concat_tables(
-                tables, ignore_index=ignore_index or are_all_range_index
+        ignore = ignore_index or are_all_range_index
+        index_names = None if ignore else tables[0]._index_names
+        column_names = tables[0]._column_names
+        with acquire_spill_lock():
+            plc_tables = [
+                plc.Table(
+                    [
+                        c.to_pylibcudf(mode="read")
+                        for c in (
+                            table._columns
+                            if ignore
+                            else itertools.chain(
+                                table._index._columns, table._columns
+                            )
+                        )
+                    ]
+                )
+                for table in tables
+            ]
+
+            concatted = libcudf.utils.data_from_pylibcudf_table(
+                plc.concatenate.concatenate(plc_tables),
+                column_names=column_names,
+                index_names=index_names,
             )
-        )
+        out = cls._from_data(*concatted)
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex
@@ -4962,7 +4986,9 @@ def apply_chunks(
         )
 
     @_performance_tracking
-    def partition_by_hash(self, columns, nparts, keep_index=True):
+    def partition_by_hash(
+        self, columns, nparts: int, keep_index: bool = True
+    ) -> list[DataFrame]:
         """Partition the dataframe by the hashed value of data in *columns*.
 
         Parameters
@@ -4986,13 +5012,21 @@ def partition_by_hash(self, columns, nparts, keep_index=True):
         else:
             cols = [*self._columns]
 
-        output_columns, offsets = libcudf.hash.hash_partition(
-            cols, key_indices, nparts
-        )
+        with acquire_spill_lock():
+            plc_table, offsets = plc.partitioning.hash_partition(
+                plc.Table([col.to_pylibcudf(mode="read") for col in cols]),
+                key_indices,
+                nparts,
+            )
+            output_columns = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_table.columns()
+            ]
+
         outdf = self._from_columns_like_self(
             output_columns,
             self._column_names,
-            self._index_names if keep_index else None,
+            self._index_names if keep_index else None,  # type: ignore[arg-type]
         )
         # Slice into partitions. Notice, `hash_partition` returns the start
         # offset of each partition thus we skip the first offset
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 2b4a17f9559..30868924bcd 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -799,15 +799,20 @@ def _quantile_table(
 
         null_precedence = [plc.types.NullOrder[key] for key in null_precedence]
 
-        return self._from_columns_like_self(
-            libcudf.quantiles.quantile_table(
-                [*self._columns],
+        with acquire_spill_lock():
+            plc_table = plc.quantiles.quantiles(
+                plc.Table(
+                    [c.to_pylibcudf(mode="read") for c in self._columns]
+                ),
                 q,
                 interpolation,
                 is_sorted,
                 column_order,
                 null_precedence,
-            ),
+            )
+            columns = libcudf.utils.columns_from_pylibcudf_table(plc_table)
+        return self._from_columns_like_self(
+            columns,
             column_names=self._column_names,
         )
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e031f2a4e8e..9130779c3e9 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -21,7 +21,7 @@
 import pandas as pd
 from typing_extensions import Self
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
 import cudf._lib as libcudf
@@ -2817,7 +2817,20 @@ def memory_usage(self, index=True, deep=False):
         """
         raise NotImplementedError
 
-    def hash_values(self, method="murmur3", seed=None):
+    def hash_values(
+        self,
+        method: Literal[
+            "murmur3",
+            "xxhash64",
+            "md5",
+            "sha1",
+            "sha224",
+            "sha256",
+            "sha384",
+            "sha512",
+        ] = "murmur3",
+        seed: int | None = None,
+    ) -> cudf.Series:
         """Compute the hash of values in this column.
 
         Parameters
@@ -2894,11 +2907,31 @@ def hash_values(self, method="murmur3", seed=None):
                 "Provided seed value has no effect for the hash method "
                 f"`{method}`. Only {seed_hash_methods} support seeds."
             )
-        # Note that both Series and DataFrame return Series objects from this
-        # calculation, necessitating the unfortunate circular reference to the
-        # child class here.
+        with acquire_spill_lock():
+            plc_table = plc.Table(
+                [c.to_pylibcudf(mode="read") for c in self._columns]
+            )
+            if method == "murmur3":
+                plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed)
+            elif method == "xxhash64":
+                plc_column = plc.hashing.xxhash_64(plc_table, seed)
+            elif method == "md5":
+                plc_column = plc.hashing.md5(plc_table)
+            elif method == "sha1":
+                plc_column = plc.hashing.sha1(plc_table)
+            elif method == "sha224":
+                plc_column = plc.hashing.sha224(plc_table)
+            elif method == "sha256":
+                plc_column = plc.hashing.sha256(plc_table)
+            elif method == "sha384":
+                plc_column = plc.hashing.sha384(plc_table)
+            elif method == "sha512":
+                plc_column = plc.hashing.sha512(plc_table)
+            else:
+                raise ValueError(f"Unsupported hashing algorithm {method}.")
+            result = libcudf.column.Column.from_pylibcudf(plc_column)
         return cudf.Series._from_column(
-            libcudf.hash.hash([*self._columns], method, seed),
+            result,
             index=self.index,
         )
 
@@ -6270,7 +6303,7 @@ def rank(
         if method not in {"average", "min", "max", "first", "dense"}:
             raise KeyError(method)
 
-        method_enum = pylibcudf.aggregation.RankMethod[method.upper()]
+        method_enum = plc.aggregation.RankMethod[method.upper()]
         if na_option not in {"keep", "top", "bottom"}:
             raise ValueError(
                 "na_option must be one of 'keep', 'top', or 'bottom'"
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index e0aee28bfeb..d95d252559f 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -22,9 +22,11 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
-import cudf._lib.labeling
-import cudf.core.index
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.groupby.groupby import (
     DataFrameGroupBy,
     GroupBy,
@@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
             func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
         )
         if len(self.grouping.bin_labels) != len(result):
-            index = cudf.core.index.Index(
+            index = cudf.Index(
                 self.grouping.bin_labels, name=self.grouping.names[0]
             )
             return result._align_to_index(
@@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy):
 
 
 class _ResampleGrouping(_Grouping):
-    bin_labels: cudf.core.index.Index
+    bin_labels: cudf.Index
 
     def __init__(self, obj, by=None, level=None):
         self._freq = getattr(by, "freq", None)
@@ -170,7 +172,7 @@ def deserialize(cls, header, frames):
         out.names = names
         out._named_columns = _named_columns
         out._key_columns = key_columns
-        out.bin_labels = cudf.core.index.Index.deserialize(
+        out.bin_labels = cudf.Index.deserialize(
             header["__bin_labels"], frames[-header["__bin_labels_count"] :]
         )
         out._freq = header["_freq"]
@@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by):
             cast_bin_labels = bin_labels.astype(result_type)
 
         # bin the key column:
-        bin_numbers = cudf._lib.labeling.label_bins(
-            cast_key_column,
-            left_edges=cast_bin_labels[:-1]._column,
-            left_inclusive=(closed == "left"),
-            right_edges=cast_bin_labels[1:]._column,
-            right_inclusive=(closed == "right"),
-        )
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                cast_key_column.to_pylibcudf(mode="read"),
+                cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES
+                if closed == "left"
+                else plc.labeling.Inclusive.NO,
+                cast_bin_labels[1:]._column.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES
+                if closed == "right"
+                else plc.labeling.Inclusive.NO,
+            )
+            bin_numbers = Column.from_pylibcudf(plc_column)
 
         if label == "right":
             cast_bin_labels = cast_bin_labels[1:]
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 6cecf3fa170..9a22045ff78 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -11,6 +11,7 @@
 from cudf import _lib as libcudf
 from cudf._lib import strings as libstrings
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
+from cudf.core._internals import unary
 from cudf.core.column import as_column
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.index import ensure_index
@@ -171,7 +172,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
             downcast_dtype = cudf.dtype(t)
             if downcast_dtype.itemsize <= col.dtype.itemsize:
                 if col.can_cast_safely(downcast_dtype):
-                    col = libcudf.unary.cast(col, downcast_dtype)
+                    col = unary.cast(col, downcast_dtype)
                     break
 
     if isinstance(arg, (cudf.Series, pd.Series)):
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index ce99f98b559..750c6cec180 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -368,6 +368,14 @@ def _process_dataset(
     file_list = paths
     if len(paths) == 1 and ioutils.is_directory(paths[0]):
         paths = ioutils.stringify_pathlike(paths[0])
+    elif (
+        filters is None
+        and isinstance(dataset_kwargs, dict)
+        and dataset_kwargs.get("partitioning") is None
+    ):
+        # Skip dataset processing if we have no filters
+        # or hive/directory partitioning to deal with.
+        return paths, row_groups, [], {}
 
     # Convert filters to ds.Expression
     if filters is not None:
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index 668e7a77454..8d342f8e6c6 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -10,8 +10,8 @@
 from pandas import testing as tm
 
 import cudf
-from cudf._lib.unary import is_nan
 from cudf.api.types import is_numeric_dtype, is_string_dtype
+from cudf.core._internals.unary import is_nan
 from cudf.core.missing import NA, NaT
 
 
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 96512dacb69..659d2ebd89a 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3771,10 +3771,10 @@ def test_parquet_chunked_reader(
     chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups
 ):
     df = pd.DataFrame(
-        {"a": [1, 2, 3, 4] * 1000000, "b": ["av", "qw", "hi", "xyz"] * 1000000}
+        {"a": [1, 2, 3, None] * 10000, "b": ["av", "qw", None, "xyz"] * 10000}
     )
     buffer = BytesIO()
-    df.to_parquet(buffer)
+    df.to_parquet(buffer, row_group_size=10000)
     actual = read_parquet_chunked(
         [buffer],
         chunk_read_limit=chunk_read_limit,
@@ -3788,6 +3788,108 @@ def test_parquet_chunked_reader(
     assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000])
+@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000])
+@pytest.mark.parametrize("num_rows", [997, 2997, None])
+def test_parquet_chunked_reader_structs(
+    chunk_read_limit,
+    pass_read_limit,
+    num_rows,
+):
+    data = [
+        {
+            "a": "g",
+            "b": {
+                "b_a": 10,
+                "b_b": {"b_b_b": None, "b_b_a": 2},
+            },
+            "c": None,
+        },
+        {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]},
+        {"a": "j", "b": None, "c": [8, 10]},
+        {"a": None, "b": {"b_a": None, "b_b": None}, "c": None},
+        None,
+        {
+            "a": None,
+            "b": {"b_a": None, "b_b": {"b_b_b": 1}},
+            "c": [18, 19],
+        },
+        {"a": None, "b": None, "c": None},
+    ] * 1000
+
+    pa_struct = pa.Table.from_pydict({"struct": data})
+    df = cudf.DataFrame.from_arrow(pa_struct)
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+
+    # Number of rows to read
+    nrows = num_rows if num_rows is not None else len(df)
+
+    actual = read_parquet_chunked(
+        [buffer],
+        chunk_read_limit=chunk_read_limit,
+        pass_read_limit=pass_read_limit,
+        nrows=nrows,
+    )
+    expected = cudf.read_parquet(
+        buffer,
+        nrows=nrows,
+    )
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000])
+@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000])
+@pytest.mark.parametrize("num_rows", [4997, 9997, None])
+@pytest.mark.parametrize(
+    "str_encoding",
+    [
+        "PLAIN",
+        "DELTA_BYTE_ARRAY",
+        "DELTA_LENGTH_BYTE_ARRAY",
+    ],
+)
+def test_parquet_chunked_reader_string_decoders(
+    chunk_read_limit,
+    pass_read_limit,
+    num_rows,
+    str_encoding,
+):
+    df = pd.DataFrame(
+        {
+            "i64": [1, 2, 3, None] * 10000,
+            "str": ["av", "qw", "asd", "xyz"] * 10000,
+            "list": list(
+                [["ad", "cd"], ["asd", "fd"], None, ["asd", None]] * 10000
+            ),
+        }
+    )
+    buffer = BytesIO()
+    # Write 4 Parquet row groups with string column encoded
+    df.to_parquet(
+        buffer,
+        row_group_size=10000,
+        use_dictionary=False,
+        column_encoding={"str": str_encoding},
+    )
+
+    # Number of rows to read
+    nrows = num_rows if num_rows is not None else len(df)
+
+    # Check with num_rows specified
+    actual = read_parquet_chunked(
+        [buffer],
+        chunk_read_limit=chunk_read_limit,
+        pass_read_limit=pass_read_limit,
+        nrows=nrows,
+    )
+    expected = cudf.read_parquet(
+        buffer,
+        nrows=nrows,
+    )
+    assert_eq(expected, actual)
+
+
 @pytest.mark.parametrize(
     "nrows,skip_rows",
     [
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 2a9104d8c82..7a759eea2e9 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -47,6 +47,9 @@ def pytest_configure(config: pytest.Config) -> None:
 EXPECTED_FAILURES: Mapping[str, str] = {
     "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
     "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning",
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
     "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
     "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
@@ -64,7 +67,6 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
-    "tests/unit/io/test_lazy_parquet.py::test_glob_n_rows": "https://github.com/rapidsai/cudf/issues/17311",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 4fefe25e91e..f050a7c568a 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.11,<1.14",
+    "polars>=1.11,<1.15",
     "pylibcudf==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index 61925b21a97..9c58a24c065 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -112,22 +112,7 @@ def test_scan(
         n_rows=n_rows,
     )
     engine = pl.GPUEngine(raise_on_fail=True, parquet_options={"chunked": is_chunked})
-    if (
-        is_chunked
-        and (columns is None or columns[0] != "a")
-        and (
-            # When we mask with the slice, it happens to remove the
-            # bad row
-            (mask is None and slice is not None)
-            # When we both slice and read a subset of rows it also
-            # removes the bad row
-            or (slice is None and n_rows is not None)
-        )
-    ):
-        # slice read produces wrong result for string column
-        request.applymarker(
-            pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/17311")
-        )
+
     if slice is not None:
         q = q.slice(*slice)
     if mask is not None:
@@ -377,13 +362,6 @@ def large_df(df, tmpdir_factory, chunked_slice):
 def test_scan_parquet_chunked(
     request, chunked_slice, large_df, chunk_read_limit, pass_read_limit
 ):
-    if chunked_slice in {"skip_partial", "partial"} and (
-        chunk_read_limit == 0 and pass_read_limit != 0
-    ):
-        request.applymarker(
-            pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/17311")
-        )
-
     assert_gpu_result_equal(
         large_df,
         engine=pl.GPUEngine(
diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py
index 4cbd7244751..166b7d98592 100644
--- a/python/custreamz/custreamz/kafka.py
+++ b/python/custreamz/custreamz/kafka.py
@@ -151,9 +151,14 @@ def read_gdf(
             "parquet": cudf.io.read_parquet,
         }
 
-        result = cudf_readers[message_format](
-            kafka_datasource, engine="cudf", lines=True
-        )
+        if message_format == "json":
+            result = cudf_readers[message_format](
+                kafka_datasource, engine="cudf", lines=True
+            )
+        else:
+            result = cudf_readers[message_format](
+                kafka_datasource, engine="cudf"
+            )
 
         # Close up the cudf datasource instance
         # TODO: Ideally the C++ destructor should handle the
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
index 39ac6474958..c0638e4a1c3 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -86,7 +86,8 @@ def _read_paths(
             )
 
         dataset_kwargs = dataset_kwargs or {}
-        dataset_kwargs["partitioning"] = partitioning or "hive"
+        if partitions:
+            dataset_kwargs["partitioning"] = partitioning or "hive"
 
         # Use cudf to read in data
         try:
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index fb02e0ac772..9c5d5523019 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -700,140 +700,10 @@ def from_dict(
         )
 
     @staticmethod
-    def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs):
-        import dask_expr as dx
-        import fsspec
-
-        if (
-            isinstance(filesystem, fsspec.AbstractFileSystem)
-            or isinstance(filesystem, str)
-            and filesystem.lower() == "fsspec"
-        ):
-            # Default "fsspec" filesystem
-            from dask_cudf._legacy.io.parquet import CudfEngine
+    def read_parquet(*args, **kwargs):
+        from dask_cudf.io.parquet import read_parquet as read_parquet_expr
 
-            _raise_unsupported_parquet_kwargs(**kwargs)
-            return _default_backend(
-                dx.read_parquet,
-                path,
-                *args,
-                filesystem=filesystem,
-                engine=CudfEngine,
-                **kwargs,
-            )
-
-        else:
-            # EXPERIMENTAL filesystem="arrow" support.
-            # This code path uses PyArrow for IO, which is only
-            # beneficial for remote storage (e.g. S3)
-
-            from fsspec.utils import stringify_path
-            from pyarrow import fs as pa_fs
-
-            # CudfReadParquetPyarrowFS requires import of distributed beforehand
-            # (See: https://github.com/dask/dask/issues/11352)
-            import distributed  # noqa: F401
-            from dask.core import flatten
-            from dask.dataframe.utils import pyarrow_strings_enabled
-
-            from dask_cudf.io.parquet import CudfReadParquetPyarrowFS
-
-            if args:
-                raise ValueError(f"Unexpected positional arguments: {args}")
-
-            if not (
-                isinstance(filesystem, pa_fs.FileSystem)
-                or isinstance(filesystem, str)
-                and filesystem.lower() in ("arrow", "pyarrow")
-            ):
-                raise ValueError(f"Unexpected filesystem value: {filesystem}.")
-
-            if not PYARROW_GE_15:
-                raise NotImplementedError(
-                    "Experimental Arrow filesystem support requires pyarrow>=15"
-                )
-
-            if not isinstance(path, str):
-                path = stringify_path(path)
-
-            # Extract kwargs
-            columns = kwargs.pop("columns", None)
-            filters = kwargs.pop("filters", None)
-            categories = kwargs.pop("categories", None)
-            index = kwargs.pop("index", None)
-            storage_options = kwargs.pop("storage_options", None)
-            dtype_backend = kwargs.pop("dtype_backend", None)
-            calculate_divisions = kwargs.pop("calculate_divisions", False)
-            ignore_metadata_file = kwargs.pop("ignore_metadata_file", False)
-            metadata_task_size = kwargs.pop("metadata_task_size", None)
-            split_row_groups = kwargs.pop("split_row_groups", "infer")
-            blocksize = kwargs.pop("blocksize", "default")
-            aggregate_files = kwargs.pop("aggregate_files", None)
-            parquet_file_extension = kwargs.pop(
-                "parquet_file_extension", (".parq", ".parquet", ".pq")
-            )
-            arrow_to_pandas = kwargs.pop("arrow_to_pandas", None)
-            open_file_options = kwargs.pop("open_file_options", None)
-
-            # Validate and normalize kwargs
-            kwargs["dtype_backend"] = dtype_backend
-            if arrow_to_pandas is not None:
-                raise ValueError(
-                    "arrow_to_pandas not supported for the 'cudf' backend."
-                )
-            if open_file_options is not None:
-                raise ValueError(
-                    "The open_file_options argument is no longer supported "
-                    "by the 'cudf' backend."
-                )
-            if filters is not None:
-                for filter in flatten(filters, container=list):
-                    _, op, val = filter
-                    if op == "in" and not isinstance(val, (set, list, tuple)):
-                        raise TypeError(
-                            "Value of 'in' filter must be a list, set or tuple."
-                        )
-            if metadata_task_size is not None:
-                raise NotImplementedError(
-                    "metadata_task_size is not supported when using the pyarrow filesystem."
-                )
-            if split_row_groups != "infer":
-                raise NotImplementedError(
-                    "split_row_groups is not supported when using the pyarrow filesystem."
-                )
-            if parquet_file_extension != (".parq", ".parquet", ".pq"):
-                raise NotImplementedError(
-                    "parquet_file_extension is not supported when using the pyarrow filesystem."
-                )
-            if blocksize is not None and blocksize != "default":
-                warnings.warn(
-                    "blocksize is not supported when using the pyarrow filesystem."
-                    "blocksize argument will be ignored."
-                )
-            if aggregate_files is not None:
-                warnings.warn(
-                    "aggregate_files is not supported when using the pyarrow filesystem. "
-                    "Please use the 'dataframe.parquet.minimum-partition-size' config."
-                    "aggregate_files argument will be ignored."
-                )
-
-            return dx.new_collection(
-                CudfReadParquetPyarrowFS(
-                    path,
-                    columns=dx._util._convert_to_list(columns),
-                    filters=filters,
-                    categories=categories,
-                    index=index,
-                    calculate_divisions=calculate_divisions,
-                    storage_options=storage_options,
-                    filesystem=filesystem,
-                    ignore_metadata_file=ignore_metadata_file,
-                    arrow_to_pandas=arrow_to_pandas,
-                    pyarrow_strings_enabled=pyarrow_strings_enabled(),
-                    kwargs=kwargs,
-                    _series=isinstance(columns, str),
-                )
-            )
+        return read_parquet_expr(*args, **kwargs)
 
     @staticmethod
     def read_csv(
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index 1e0f24d78ce..212951336c9 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from dask_cudf import _deprecated_api
+from dask_cudf import _deprecated_api, QUERY_PLANNING_ON
 
 from . import csv, orc, json, parquet, text  # noqa: F401
 
@@ -22,9 +22,14 @@
 read_text = _deprecated_api(
     "dask_cudf.io.read_text", new_api="dask_cudf.read_text"
 )
-read_parquet = _deprecated_api(
-    "dask_cudf.io.read_parquet", new_api="dask_cudf.read_parquet"
-)
+if QUERY_PLANNING_ON:
+    read_parquet = parquet.read_parquet
+else:
+    read_parquet = _deprecated_api(
+        "The legacy dask_cudf.io.read_parquet API",
+        new_api="dask_cudf.read_parquet",
+        rec="",
+    )
 to_parquet = _deprecated_api(
     "dask_cudf.io.to_parquet",
     new_api="dask_cudf._legacy.io.parquet.to_parquet",
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index a7a116875ea..bf8fae552c2 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -1,58 +1,252 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
 import functools
+import itertools
+import math
+import os
+import warnings
+from typing import TYPE_CHECKING, Any
 
+import numpy as np
 import pandas as pd
-from dask_expr.io.io import FusedParquetIO
-from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS
+from dask_expr._expr import Elemwise
+from dask_expr._util import _convert_to_list
+from dask_expr.io.io import FusedIO, FusedParquetIO
+from dask_expr.io.parquet import (
+    FragmentWrapper,
+    ReadParquetFSSpec,
+    ReadParquetPyarrowFS,
+)
 
 from dask._task_spec import Task
+from dask.dataframe.io.parquet.arrow import _filters_to_expression
+from dask.dataframe.io.parquet.core import ParquetFunctionWrapper
+from dask.tokenize import tokenize
+from dask.utils import parse_bytes
 
 import cudf
 
-from dask_cudf import _deprecated_api
+from dask_cudf import QUERY_PLANNING_ON, _deprecated_api
 
 # Dask-expr imports CudfEngine from this module
 from dask_cudf._legacy.io.parquet import CudfEngine  # noqa: F401
 
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+
+_DEVICE_SIZE_CACHE: int | None = None
+
+
+def _get_device_size():
+    try:
+        # Use PyNVML to find the worker device size.
+        import pynvml
+
+        pynvml.nvmlInit()
+        index = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+        if index and not index.isnumeric():
+            # This means index is UUID. This works for both MIG and non-MIG device UUIDs.
+            handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(index))
+        else:
+            # This is a device index
+            handle = pynvml.nvmlDeviceGetHandleByIndex(int(index))
+        return pynvml.nvmlDeviceGetMemoryInfo(handle).total
+
+    except (ImportError, ValueError):
+        # Fall back to a conservative 8GiB default
+        return 8 * 1024**3
+
+
+def _normalize_blocksize(fraction: float = 0.03125):
+    # Set the blocksize to fraction * <device-size>.
+    # We use the smallest worker device to set <device-size>.
+    # (Default blocksize is 1/32 * <device-size>)
+    global _DEVICE_SIZE_CACHE
+
+    if _DEVICE_SIZE_CACHE is None:
+        try:
+            # Check distributed workers (if a client exists)
+            from distributed import get_client
+
+            client = get_client()
+            # TODO: Check "GPU" worker resources only.
+            # Depends on (https://github.com/rapidsai/dask-cuda/pull/1401)
+            device_size = min(client.run(_get_device_size).values())
+        except (ImportError, ValueError):
+            device_size = _get_device_size()
+        _DEVICE_SIZE_CACHE = device_size
+
+    return int(_DEVICE_SIZE_CACHE * fraction)
+
+
+class NoOp(Elemwise):
+    # Workaround - Always wrap read_parquet operations
+    # in a NoOp to trigger tune_up optimizations.
+    _parameters = ["frame"]
+    _is_length_preserving = True
+    _projection_passthrough = True
+    _filter_passthrough = True
+    _preserves_partitioning_information = True
 
-class CudfFusedParquetIO(FusedParquetIO):
     @staticmethod
-    def _load_multiple_files(
-        frag_filters,
-        columns,
-        schema,
-        **to_pandas_kwargs,
-    ):
-        import pyarrow as pa
+    def operation(x):
+        return x
 
-        from dask.base import apply, tokenize
-        from dask.threaded import get
 
-        token = tokenize(frag_filters, columns, schema)
-        name = f"pq-file-{token}"
-        dsk = {
-            (name, i): (
-                CudfReadParquetPyarrowFS._fragment_to_table,
-                frag,
-                filter,
-                columns,
-                schema,
-            )
-            for i, (frag, filter) in enumerate(frag_filters)
-        }
-        dsk[name] = (
-            apply,
-            pa.concat_tables,
-            [list(dsk.keys())],
-            {"promote_options": "permissive"},
-        )
-        return CudfReadParquetPyarrowFS._table_to_pandas(
-            get(dsk, name),
-            **to_pandas_kwargs,
-        )
+class CudfReadParquetFSSpec(ReadParquetFSSpec):
+    _STATS_CACHE: MutableMapping[str, Any] = {}
+
+    def approx_statistics(self):
+        # Use a few files to approximate column-size statistics
+        key = tokenize(self._dataset_info["ds"].files[:10], self.filters)
+        try:
+            return self._STATS_CACHE[key]
+
+        except KeyError:
+            # Account for filters
+            ds_filters = None
+            if self.filters is not None:
+                ds_filters = _filters_to_expression(self.filters)
+
+            # Use average total_uncompressed_size of three files
+            n_sample = 3
+            column_sizes = {}
+            for i, frag in enumerate(
+                self._dataset_info["ds"].get_fragments(ds_filters)
+            ):
+                md = frag.metadata
+                for rg in range(md.num_row_groups):
+                    row_group = md.row_group(rg)
+                    for col in range(row_group.num_columns):
+                        column = row_group.column(col)
+                        name = column.path_in_schema
+                        if name not in column_sizes:
+                            column_sizes[name] = np.zeros(
+                                n_sample, dtype="int64"
+                            )
+                        column_sizes[name][i] += column.total_uncompressed_size
+                if (i + 1) >= n_sample:
+                    break
+
+            # Reorganize stats to look like arrow-fs version
+            self._STATS_CACHE[key] = {
+                "columns": [
+                    {
+                        "path_in_schema": name,
+                        "total_uncompressed_size": np.mean(sizes),
+                    }
+                    for name, sizes in column_sizes.items()
+                ]
+            }
+            return self._STATS_CACHE[key]
+
+    @functools.cached_property
+    def _fusion_compression_factor(self):
+        # Disable fusion when blocksize=None
+        if self.blocksize is None:
+            return 1
+
+        # At this point, we *may* have used `blockwise`
+        # already to split or aggregate files. We don't
+        # *know* if the current partitions correspond to
+        # individual/full files, multiple/aggregated files
+        # or partial/split files.
+        #
+        # Therefore, we need to use the statistics from
+        # a few files to estimate the current partition
+        # size. This size should be similar to `blocksize`
+        # *if* aggregate_files is True or if the files
+        # are *smaller* than `blocksize`.
+
+        # Step 1: Sample statistics
+        approx_stats = self.approx_statistics()
+        projected_size, original_size = 0, 0
+        col_op = self.operand("columns") or self.columns
+        for col in approx_stats["columns"]:
+            original_size += col["total_uncompressed_size"]
+            if col["path_in_schema"] in col_op or (
+                (split_name := col["path_in_schema"].split("."))
+                and split_name[0] in col_op
+            ):
+                projected_size += col["total_uncompressed_size"]
+        if original_size < 1 or projected_size < 1:
+            return 1
+
+        # Step 2: Estimate the correction factor
+        # (Correct for possible pre-optimization fusion/splitting)
+        blocksize = parse_bytes(self.blocksize)
+        if original_size > blocksize:
+            # Input files are bigger than blocksize
+            # and we already split these large files.
+            # (correction_factor > 1)
+            correction_factor = original_size / blocksize
+        elif self.aggregate_files:
+            # Input files are smaller than blocksize
+            # and we already aggregate small files.
+            # (correction_factor == 1)
+            correction_factor = 1
+        else:
+            # Input files are smaller than blocksize
+            # but we haven't aggregate small files yet.
+            # (correction_factor < 1)
+            correction_factor = original_size / blocksize
+
+        # Step 3. Estimate column-projection factor
+        if self.operand("columns") is None:
+            projection_factor = 1
+        else:
+            projection_factor = projected_size / original_size
+
+        return max(projection_factor * correction_factor, 0.001)
+
+    def _tune_up(self, parent):
+        if self._fusion_compression_factor >= 1:
+            return
+        if isinstance(parent, FusedIO):
+            return
+        return parent.substitute(self, CudfFusedIO(self))
 
 
 class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS):
+    _parameters = [
+        "path",
+        "columns",
+        "filters",
+        "categories",
+        "index",
+        "storage_options",
+        "filesystem",
+        "blocksize",
+        "ignore_metadata_file",
+        "calculate_divisions",
+        "arrow_to_pandas",
+        "pyarrow_strings_enabled",
+        "kwargs",
+        "_partitions",
+        "_series",
+        "_dataset_info_cache",
+    ]
+    _defaults = {
+        "columns": None,
+        "filters": None,
+        "categories": None,
+        "index": None,
+        "storage_options": None,
+        "filesystem": None,
+        "blocksize": "256 MiB",
+        "ignore_metadata_file": True,
+        "calculate_divisions": False,
+        "arrow_to_pandas": None,
+        "pyarrow_strings_enabled": True,
+        "kwargs": None,
+        "_partitions": None,
+        "_series": False,
+        "_dataset_info_cache": None,
+    }
+
     @functools.cached_property
     def _dataset_info(self):
         from dask_cudf._legacy.io.parquet import (
@@ -86,11 +280,92 @@ def _dataset_info(self):
 
     @staticmethod
     def _table_to_pandas(table, index_name):
-        df = cudf.DataFrame.from_arrow(table)
-        if index_name is not None:
-            df = df.set_index(index_name)
+        if isinstance(table, cudf.DataFrame):
+            df = table
+        else:
+            df = cudf.DataFrame.from_arrow(table)
+            if index_name is not None:
+                return df.set_index(index_name)
         return df
 
+    @staticmethod
+    def _fragments_to_cudf_dataframe(
+        fragment_wrappers,
+        filters,
+        columns,
+        schema,
+    ):
+        from dask.dataframe.io.utils import _is_local_fs
+
+        from cudf.io.parquet import _apply_post_filters, _normalize_filters
+
+        if not isinstance(fragment_wrappers, list):
+            fragment_wrappers = [fragment_wrappers]
+
+        filesystem = None
+        paths, row_groups = [], []
+        for fw in fragment_wrappers:
+            frag = fw.fragment if isinstance(fw, FragmentWrapper) else fw
+            paths.append(frag.path)
+            row_groups.append(
+                [rg.id for rg in frag.row_groups] if frag.row_groups else None
+            )
+            if filesystem is None:
+                filesystem = frag.filesystem
+
+        if _is_local_fs(filesystem):
+            filesystem = None
+        else:
+            from fsspec.implementations.arrow import ArrowFSWrapper
+
+            filesystem = ArrowFSWrapper(filesystem)
+            protocol = filesystem.protocol
+            paths = [f"{protocol}://{path}" for path in paths]
+
+        filters = _normalize_filters(filters)
+        projected_columns = None
+        if columns and filters:
+            projected_columns = [c for c in columns if c is not None]
+            columns = sorted(
+                set(v[0] for v in itertools.chain.from_iterable(filters))
+                | set(projected_columns)
+            )
+
+        if row_groups == [None for path in paths]:
+            row_groups = None
+
+        df = cudf.read_parquet(
+            paths,
+            columns=columns,
+            filters=filters,
+            row_groups=row_groups,
+            dataset_kwargs={"schema": schema},
+        )
+
+        # Apply filters (if any are defined)
+        df = _apply_post_filters(df, filters)
+        if projected_columns:
+            # Elements of `projected_columns` may now be in the index.
+            # We must filter these names from our projection
+            projected_columns = [
+                col for col in projected_columns if col in df._column_names
+            ]
+            df = df[projected_columns]
+
+        # TODO: Deal with hive partitioning.
+        # Note that ReadParquetPyarrowFS does NOT support this yet anyway.
+        return df
+
+    @functools.cached_property
+    def _use_device_io(self):
+        from dask.dataframe.io.utils import _is_local_fs
+
+        # Use host for remote filesystem only
+        # (Unless we are using kvikio-S3)
+        return _is_local_fs(self.fs) or (
+            self.fs.type_name == "s3" and cudf.get_option("kvikio_remote_io")
+        )
+
     def _filtered_task(self, name, index: int):
         columns = self.columns.copy()
         index_name = self.index.name
@@ -101,12 +376,17 @@ def _filtered_task(self, name, index: int):
             if columns is None:
                 columns = list(schema.names)
             columns.append(index_name)
+
+        frag_to_table = self._fragment_to_table
+        if self._use_device_io:
+            frag_to_table = self._fragments_to_cudf_dataframe
+
         return Task(
             name,
             self._table_to_pandas,
             Task(
                 None,
-                self._fragment_to_table,
+                frag_to_table,
                 fragment_wrapper=FragmentWrapper(
                     self.fragments[index], filesystem=self.fs
                 ),
@@ -117,18 +397,441 @@ def _filtered_task(self, name, index: int):
             index_name=index_name,
         )
 
+    @property
+    def _fusion_compression_factor(self):
+        blocksize = self.blocksize
+        if blocksize is None:
+            return 1
+        elif blocksize == "default":
+            blocksize = "256MiB"
+
+        projected_size = 0
+        approx_stats = self.approx_statistics()
+        col_op = self.operand("columns") or self.columns
+        for col in approx_stats["columns"]:
+            if col["path_in_schema"] in col_op or (
+                (split_name := col["path_in_schema"].split("."))
+                and split_name[0] in col_op
+            ):
+                projected_size += col["total_uncompressed_size"]
+
+        if projected_size < 1:
+            return 1
+
+        aggregate_files = max(1, int(parse_bytes(blocksize) / projected_size))
+        return max(1 / aggregate_files, 0.001)
+
     def _tune_up(self, parent):
         if self._fusion_compression_factor >= 1:
             return
-        if isinstance(parent, CudfFusedParquetIO):
+        fused_cls = (
+            CudfFusedParquetIO
+            if self._use_device_io
+            else CudfFusedParquetIOHost
+        )
+        if isinstance(parent, fused_cls):
             return
-        return parent.substitute(self, CudfFusedParquetIO(self))
+        return parent.substitute(self, fused_cls(self))
 
 
-read_parquet = _deprecated_api(
-    "dask_cudf.io.parquet.read_parquet",
-    new_api="dask_cudf.read_parquet",
-)
+class CudfFusedIO(FusedIO):
+    def _task(self, name, index: int):
+        expr = self.operand("_expr")
+        bucket = self._fusion_buckets[index]
+        io_func = expr._filtered_task(name, 0).func
+        if not isinstance(
+            io_func, ParquetFunctionWrapper
+        ) or io_func.common_kwargs.get("partitions", None):
+            # Just use "simple" fusion if we have an unexpected
+            # callable, or we are dealing with hive partitioning.
+            return Task(
+                name,
+                cudf.concat,
+                [expr._filtered_task(name, i) for i in bucket],
+            )
+
+        pieces = []
+        for i in bucket:
+            piece = expr._filtered_task(name, i).args[0]
+            if isinstance(piece, list):
+                pieces.extend(piece)
+            else:
+                pieces.append(piece)
+        return Task(name, io_func, pieces)
+
+
+class CudfFusedParquetIO(FusedParquetIO):
+    @functools.cached_property
+    def _fusion_buckets(self):
+        partitions = self.operand("_expr")._partitions
+        npartitions = len(partitions)
+
+        step = math.ceil(1 / self.operand("_expr")._fusion_compression_factor)
+
+        # TODO: Heuristic to limit fusion should probably
+        # account for the number of workers. For now, just
+        # limiting fusion to 100 partitions at once.
+        step = min(step, 100)
+
+        buckets = [
+            partitions[i : i + step] for i in range(0, npartitions, step)
+        ]
+        return buckets
+
+    @classmethod
+    def _load_multiple_files(
+        cls,
+        frag_filters,
+        columns,
+        schema,
+        **to_pandas_kwargs,
+    ):
+        frag_to_table = CudfReadParquetPyarrowFS._fragments_to_cudf_dataframe
+        return CudfReadParquetPyarrowFS._table_to_pandas(
+            frag_to_table(
+                [frag[0] for frag in frag_filters],
+                frag_filters[0][1],  # TODO: Check for consistent filters?
+                columns,
+                schema,
+            ),
+            **to_pandas_kwargs,
+        )
+
+
+class CudfFusedParquetIOHost(CudfFusedParquetIO):
+    @classmethod
+    def _load_multiple_files(
+        cls,
+        frag_filters,
+        columns,
+        schema,
+        **to_pandas_kwargs,
+    ):
+        import pyarrow as pa
+
+        from dask.base import apply, tokenize
+        from dask.threaded import get
+
+        token = tokenize(frag_filters, columns, schema)
+        name = f"pq-file-{token}"
+        dsk = {
+            (name, i): (
+                CudfReadParquetPyarrowFS._fragment_to_table,
+                frag,
+                filter,
+                columns,
+                schema,
+            )
+            for i, (frag, filter) in enumerate(frag_filters)
+        }
+        dsk[name] = (
+            apply,
+            pa.concat_tables,
+            [list(dsk.keys())],
+            {"promote_options": "permissive"},
+        )
+
+        return CudfReadParquetPyarrowFS._table_to_pandas(
+            get(dsk, name),
+            **to_pandas_kwargs,
+        )
+
+
+def read_parquet_expr(
+    path,
+    *args,
+    columns=None,
+    filters=None,
+    categories=None,
+    index=None,
+    storage_options=None,
+    dtype_backend=None,
+    calculate_divisions=False,
+    ignore_metadata_file=False,
+    metadata_task_size=None,
+    split_row_groups="infer",
+    blocksize="default",
+    aggregate_files=None,
+    parquet_file_extension=(".parq", ".parquet", ".pq"),
+    filesystem="fsspec",
+    engine=None,
+    arrow_to_pandas=None,
+    open_file_options=None,
+    **kwargs,
+):
+    """
+    Read a Parquet file into a Dask-cuDF DataFrame.
+
+    This reads a directory of Parquet data into a DataFrame collection.
+    Partitioning behavior mostly depends on the ``blocksize`` argument.
+
+    .. note::
+        Dask may automatically resize partitions at optimization time.
+        Please set ``blocksize=None`` to disable this behavior in Dask cuDF.
+        (NOTE: This will not disable fusion for the "pandas" backend)
+
+    .. note::
+        Specifying ``filesystem="arrow"`` leverages a complete reimplementation of
+        the Parquet reader that is solely based on PyArrow. It is faster than the
+        legacy implementation in some cases, but doesn't yet support all features.
+
+    Parameters
+    ----------
+    path : str or list
+        Source directory for data, or path(s) to individual parquet files.
+        Prefix with a protocol like ``s3://`` to read from alternative
+        filesystems. To read from multiple files you can pass a globstring or a
+        list of paths, with the caveat that they must all have the same
+        protocol.
+    columns : str or list, default None
+        Field name(s) to read in as columns in the output. By default all
+        non-index fields will be read (as determined by the pandas parquet
+        metadata, if present). Provide a single field name instead of a list to
+        read in the data as a Series.
+    filters : Union[List[Tuple[str, str, Any]], List[List[Tuple[str, str, Any]]]], default None
+        List of filters to apply, like ``[[('col1', '==', 0), ...], ...]``.
+        Using this argument will result in row-wise filtering of the final partitions.
+
+        Predicates can be expressed in disjunctive normal form (DNF). This means that
+        the inner-most tuple describes a single column predicate. These inner predicates
+        are combined with an AND conjunction into a larger predicate. The outer-most
+        list then combines all of the combined filters with an OR disjunction.
+
+        Predicates can also be expressed as a ``List[Tuple]``. These are evaluated
+        as an AND conjunction. To express OR in predicates, one must use the
+        (preferred for "pyarrow") ``List[List[Tuple]]`` notation.
+    index : str, list or False, default None
+        Field name(s) to use as the output frame index. By default will be
+        inferred from the pandas parquet file metadata, if present. Use ``False``
+        to read all fields as columns.
+    categories : list or dict, default None
+        For any fields listed here, if the parquet encoding is Dictionary,
+        the column will be created with dtype category. Use only if it is
+        guaranteed that the column is encoded as dictionary in all row-groups.
+        If a list, assumes up to 2**16-1 labels; if a dict, specify the number
+        of labels expected; if None, will load categories automatically for
+        data written by dask, not otherwise.
+    storage_options : dict, default None
+        Key/value pairs to be passed on to the file-system backend, if any.
+        Note that the default file-system backend can be configured with the
+        ``filesystem`` argument, described below.
+    calculate_divisions : bool, default False
+        Whether to use min/max statistics from the footer metadata (or global
+        ``_metadata`` file) to calculate divisions for the output DataFrame
+        collection. Divisions will not be calculated if statistics are missing.
+        This option will be ignored if ``index`` is not specified and there is
+        no physical index column specified in the custom "pandas" Parquet
+        metadata. Note that ``calculate_divisions=True`` may be extremely slow
+        when no global ``_metadata`` file is present, especially when reading
+        from remote storage. Set this to ``True`` only when known divisions
+        are needed for your workload (see :ref:`dataframe-design-partitions`).
+    ignore_metadata_file : bool, default False
+        Whether to ignore the global ``_metadata`` file (when one is present).
+        If ``True``, or if the global ``_metadata`` file is missing, the parquet
+        metadata may be gathered and processed in parallel. Parallel metadata
+        processing is currently supported for ``ArrowDatasetEngine`` only.
+    metadata_task_size : int, default configurable
+        If parquet metadata is processed in parallel (see ``ignore_metadata_file``
+        description above), this argument can be used to specify the number of
+        dataset files to be processed by each task in the Dask graph.  If this
+        argument is set to ``0``, parallel metadata processing will be disabled.
+        The default values for local and remote filesystems can be specified
+        with the "metadata-task-size-local" and "metadata-task-size-remote"
+        config fields, respectively (see "dataframe.parquet").
+    split_row_groups : 'infer', 'adaptive', bool, or int, default 'infer'
+        WARNING: The ``split_row_groups`` argument is now deprecated, please use
+        ``blocksize`` instead.
+
+    blocksize : int, float or str, default 'default'
+        The desired size of each output ``DataFrame`` partition in terms of total
+        (uncompressed) parquet storage space. This argument may be used to split
+        large files or aggregate small files into the same partition. Use ``None``
+        for a simple 1:1 mapping between files and partitions. Use a float value
+        less than 1.0 to specify the fractional size of the partitions with
+        respect to the total memory of the first NVIDIA GPU on your machine.
+        Default is 1/32 the total memory of a single GPU.
+    aggregate_files : bool or str, default None
+        WARNING: The behavior of ``aggregate_files=True`` is now obsolete
+        when query-planning is enabled (the default). Small files are now
+        aggregated automatically according to the ``blocksize`` setting.
+        Please expect this argument to be deprecated in a future release.
+
+        WARNING: Passing a string argument to ``aggregate_files`` will result
+        in experimental behavior that may be removed at any time.
+
+    parquet_file_extension: str, tuple[str], or None, default (".parq", ".parquet", ".pq")
+        A file extension or an iterable of extensions to use when discovering
+        parquet files in a directory. Files that don't match these extensions
+        will be ignored. This argument only applies when ``paths`` corresponds
+        to a directory and no ``_metadata`` file is present (or
+        ``ignore_metadata_file=True``). Passing in ``parquet_file_extension=None``
+        will treat all files in the directory as parquet files.
+
+        The purpose of this argument is to ensure that the engine will ignore
+        unsupported metadata files (like Spark's '_SUCCESS' and 'crc' files).
+        It may be necessary to change this argument if the data files in your
+        parquet dataset do not end in ".parq", ".parquet", or ".pq".
+    filesystem: "fsspec", "arrow", or fsspec.AbstractFileSystem backend to use.
+    dataset: dict, default None
+        Dictionary of options to use when creating a ``pyarrow.dataset.Dataset`` object.
+        These options may include a "filesystem" key to configure the desired
+        file-system backend. However, the top-level ``filesystem`` argument will always
+        take precedence.
+
+        **Note**: The ``dataset`` options may include a "partitioning" key.
+        However, since ``pyarrow.dataset.Partitioning``
+        objects cannot be serialized, the value can be a dict of key-word
+        arguments for the ``pyarrow.dataset.partitioning`` API
+        (e.g. ``dataset={"partitioning": {"flavor": "hive", "schema": ...}}``).
+        Note that partitioned columns will not be converted to categorical
+        dtypes when a custom partitioning schema is specified in this way.
+    read: dict, default None
+        Dictionary of options to pass through to ``CudfEngine.read_partitions``
+        using the ``read`` key-word argument.
+    """
+
+    import dask_expr as dx
+    from fsspec.utils import stringify_path
+    from pyarrow import fs as pa_fs
+
+    from dask.core import flatten
+    from dask.dataframe.utils import pyarrow_strings_enabled
+
+    from dask_cudf.backends import PYARROW_GE_15
+
+    if args:
+        raise ValueError(f"Unexpected positional arguments: {args}")
+
+    if open_file_options is not None:
+        raise ValueError(
+            "The open_file_options argument is no longer supported "
+            "by the 'cudf' backend."
+        )
+    if dtype_backend is not None:
+        raise NotImplementedError(
+            "dtype_backend is not supported by the 'cudf' backend."
+        )
+    if arrow_to_pandas is not None:
+        raise NotImplementedError(
+            "arrow_to_pandas is not supported by the 'cudf' backend."
+        )
+    if engine not in (None, "cudf", CudfEngine):
+        raise NotImplementedError(
+            "engine={engine} is not supported by the 'cudf' backend."
+        )
+
+    if not isinstance(path, str):
+        path = stringify_path(path)
+
+    kwargs["dtype_backend"] = None
+    if arrow_to_pandas:
+        kwargs["arrow_to_pandas"] = None
+
+    if filters is not None:
+        for filter in flatten(filters, container=list):
+            _, op, val = filter
+            if op == "in" and not isinstance(val, (set, list, tuple)):
+                raise TypeError(
+                    "Value of 'in' filter must be a list, set or tuple."
+                )
+
+    # Normalize blocksize input
+    if blocksize == "default":
+        blocksize = _normalize_blocksize()
+    elif isinstance(blocksize, float) and blocksize < 1:
+        blocksize = _normalize_blocksize(blocksize)
+
+    if (
+        isinstance(filesystem, pa_fs.FileSystem)
+        or isinstance(filesystem, str)
+        and filesystem.lower() in ("arrow", "pyarrow")
+    ):
+        # EXPERIMENTAL filesystem="arrow" support.
+        # This code path may use PyArrow for remote IO.
+
+        # CudfReadParquetPyarrowFS requires import of distributed beforehand
+        # (See: https://github.com/dask/dask/issues/11352)
+        import distributed  # noqa: F401
+
+        if not PYARROW_GE_15:
+            raise ValueError(
+                "pyarrow>=15.0.0 is required to use the pyarrow filesystem."
+            )
+        if metadata_task_size is not None:
+            warnings.warn(
+                "metadata_task_size is not supported when using the pyarrow filesystem."
+                " This argument will be ignored!"
+            )
+        if aggregate_files is not None:
+            warnings.warn(
+                "aggregate_files is not supported when using the pyarrow filesystem."
+                " This argument will be ignored!"
+            )
+        if split_row_groups != "infer":
+            warnings.warn(
+                "split_row_groups is not supported when using the pyarrow filesystem."
+                " This argument will be ignored!"
+            )
+        if parquet_file_extension != (".parq", ".parquet", ".pq"):
+            raise NotImplementedError(
+                "parquet_file_extension is not supported when using the pyarrow filesystem."
+            )
+
+        return dx.new_collection(
+            NoOp(
+                CudfReadParquetPyarrowFS(
+                    path,
+                    columns=_convert_to_list(columns),
+                    filters=filters,
+                    categories=categories,
+                    index=index,
+                    calculate_divisions=calculate_divisions,
+                    storage_options=storage_options,
+                    filesystem=filesystem,
+                    blocksize=blocksize,
+                    ignore_metadata_file=ignore_metadata_file,
+                    arrow_to_pandas=None,
+                    pyarrow_strings_enabled=pyarrow_strings_enabled(),
+                    kwargs=kwargs,
+                    _series=isinstance(columns, str),
+                ),
+            )
+        )
+
+    return dx.new_collection(
+        NoOp(
+            CudfReadParquetFSSpec(
+                path,
+                columns=_convert_to_list(columns),
+                filters=filters,
+                categories=categories,
+                index=index,
+                blocksize=blocksize,
+                storage_options=storage_options,
+                calculate_divisions=calculate_divisions,
+                ignore_metadata_file=ignore_metadata_file,
+                metadata_task_size=metadata_task_size,
+                split_row_groups=split_row_groups,
+                aggregate_files=aggregate_files,
+                parquet_file_extension=parquet_file_extension,
+                filesystem=filesystem,
+                engine=CudfEngine,
+                kwargs=kwargs,
+                _series=isinstance(columns, str),
+            ),
+        )
+    )
+
+
+if QUERY_PLANNING_ON:
+    read_parquet = read_parquet_expr
+    read_parquet.__doc__ = read_parquet_expr.__doc__
+else:
+    read_parquet = _deprecated_api(
+        "The legacy dask_cudf.io.parquet.read_parquet API",
+        new_api="dask_cudf.read_parquet",
+        rec="",
+    )
 to_parquet = _deprecated_api(
     "dask_cudf.io.parquet.to_parquet",
     new_api="dask_cudf._legacy.io.parquet.to_parquet",
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 522a21e12a5..6efe6c4f388 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -46,7 +46,7 @@ def test_roundtrip_backend_dispatch(tmpdir):
     tmpdir = str(tmpdir)
     ddf.to_parquet(tmpdir, engine="pyarrow")
     with dask.config.set({"dataframe.backend": "cudf"}):
-        ddf2 = dd.read_parquet(tmpdir, index=False)
+        ddf2 = dd.read_parquet(tmpdir, index=False, blocksize=None)
     assert isinstance(ddf2, dask_cudf.DataFrame)
     dd.assert_eq(ddf.reset_index(drop=False), ddf2)
 
@@ -100,7 +100,7 @@ def test_roundtrip_from_dask_index_false(tmpdir):
     tmpdir = str(tmpdir)
     ddf.to_parquet(tmpdir, engine="pyarrow")
 
-    ddf2 = dask_cudf.read_parquet(tmpdir, index=False)
+    ddf2 = dask_cudf.read_parquet(tmpdir, index=False, blocksize=None)
     dd.assert_eq(ddf.reset_index(drop=False), ddf2)
 
 
@@ -667,7 +667,7 @@ def test_to_parquet_append(tmpdir, write_metadata_file):
         write_metadata_file=write_metadata_file,
         write_index=False,
     )
-    ddf2 = dask_cudf.read_parquet(tmpdir)
+    ddf2 = dask_cudf.read_parquet(tmpdir, blocksize=None)
     dd.assert_eq(cudf.concat([df, df]), ddf2)
 
 
@@ -677,13 +677,17 @@ def test_deprecated_api_paths(tmpdir):
     with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"):
         dask_cudf.io.to_parquet(df, tmpdir)
 
-    # Encourage top-level read_parquet import only
-    with pytest.warns(match="dask_cudf.io.read_parquet is now deprecated"):
+    if dask_cudf.QUERY_PLANNING_ON:
         df2 = dask_cudf.io.read_parquet(tmpdir)
-    dd.assert_eq(df, df2, check_divisions=False)
+        dd.assert_eq(df, df2, check_divisions=False)
 
-    with pytest.warns(
-        match="dask_cudf.io.parquet.read_parquet is now deprecated"
-    ):
         df2 = dask_cudf.io.parquet.read_parquet(tmpdir)
-    dd.assert_eq(df, df2, check_divisions=False)
+        dd.assert_eq(df, df2, check_divisions=False)
+    else:
+        with pytest.warns(match="legacy dask_cudf.io.read_parquet"):
+            df2 = dask_cudf.io.read_parquet(tmpdir)
+            dd.assert_eq(df, df2, check_divisions=False)
+
+        with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"):
+            df2 = dask_cudf.io.parquet.read_parquet(tmpdir)
+            dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py
index a91fbb7aecf..c3ff5534e87 100644
--- a/python/libcudf/libcudf/load.py
+++ b/python/libcudf/libcudf/load.py
@@ -76,9 +76,15 @@ def load_library():
         # Prefer the libraries bundled in this package. If they aren't found
         # (which might be the case in builds where the library was prebuilt before
         # packaging the wheel), look for a system installation.
-        libcudf_lib = _load_wheel_installation(soname)
-        if libcudf_lib is None:
-            libcudf_lib = _load_system_installation(soname)
+        try:
+            libcudf_lib = _load_wheel_installation(soname)
+            if libcudf_lib is None:
+                libcudf_lib = _load_system_installation(soname)
+        except OSError:
+            # If none of the searches above succeed, just silently return None
+            # and rely on other mechanisms (like RPATHs on other DSOs) to
+            # help the loader find the library.
+            pass
 
     # The caller almost never needs to do anything with this library, but no
     # harm in offering the option since this object at least provides a handle
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd
index 2a10cb5b3d5..3745e893c3e 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pxd
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd
@@ -12,6 +12,7 @@ cdef class PackedColumns:
 
     @staticmethod
     cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data)
+    cpdef tuple release(self)
 
 cpdef PackedColumns pack(Table input)
 
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx
index 94873e079c9..2a40d42e6e9 100644
--- a/python/pylibcudf/pylibcudf/contiguous_split.pyx
+++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx
@@ -63,6 +63,7 @@ cdef class HostBuffer:
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
+
 cdef class PackedColumns:
     """Column data in a serialized format.
 
@@ -87,7 +88,7 @@ cdef class PackedColumns:
         out.c_obj = move(data)
         return out
 
-    def release(self):
+    cpdef tuple release(self):
         """Releases and returns the underlying serialized metadata and gpu data.
 
         The ownership of the memory are transferred to the returned buffers. After
diff --git a/python/pylibcudf/pylibcudf/expressions.pyi b/python/pylibcudf/pylibcudf/expressions.pyi
index 12b473d8605..4dcccaaa1fc 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyi
+++ b/python/pylibcudf/pylibcudf/expressions.pyi
@@ -77,3 +77,5 @@ class Operation(Expression):
         left: Expression,
         right: Expression | None = None,
     ): ...
+
+def to_expression(expr: str, column_names: tuple[str, ...]) -> Expression: ...
diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
index 0f12cfe313c..31121785e27 100644
--- a/python/pylibcudf/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -1,4 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+import ast
+import functools
+
+import pyarrow as pa
+
 from pylibcudf.libcudf.expressions import \
     ast_operator as ASTOperator  # no-cython-lint
 from pylibcudf.libcudf.expressions import \
@@ -46,6 +51,8 @@ from .scalar cimport Scalar
 from .traits cimport is_chrono, is_numeric
 from .types cimport DataType
 
+from .interop import from_arrow
+
 # Aliases for simplicity
 ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
 
@@ -57,6 +64,7 @@ __all__ = [
     "Literal",
     "Operation",
     "TableReference",
+    "to_expression"
 ]
 
 # Define this class just to have a docstring for it
@@ -261,3 +269,217 @@ cdef class ColumnNameReference(Expression):
             move(make_unique[libcudf_exp.column_name_reference](
                 <string>(name.encode("utf-8"))
             ))
+
+
+# This dictionary encodes the mapping from Python AST operators to their cudf
+# counterparts.
+_python_cudf_operator_map = {
+    # Binary operators
+    ast.Add: ASTOperator.ADD,
+    ast.Sub: ASTOperator.SUB,
+    ast.Mult: ASTOperator.MUL,
+    ast.Div: ASTOperator.DIV,
+    ast.FloorDiv: ASTOperator.FLOOR_DIV,
+    ast.Mod: ASTOperator.PYMOD,
+    ast.Pow: ASTOperator.POW,
+    ast.Eq: ASTOperator.EQUAL,
+    ast.NotEq: ASTOperator.NOT_EQUAL,
+    ast.Lt: ASTOperator.LESS,
+    ast.Gt: ASTOperator.GREATER,
+    ast.LtE: ASTOperator.LESS_EQUAL,
+    ast.GtE: ASTOperator.GREATER_EQUAL,
+    ast.BitXor: ASTOperator.BITWISE_XOR,
+    # TODO: The mapping of logical/bitwise operators here is inconsistent with
+    # pandas. In pandas, Both `BitAnd` and `And` map to
+    # `ASTOperator.LOGICAL_AND` for booleans, while they map to
+    # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to
+    # encode this at present because expressions can be arbitrarily nested so
+    # we won't know the dtype of the input without inserting a much more
+    # complex traversal of the expression tree to determine the output types at
+    # each node. For now, we'll rely on users to use the appropriate operator.
+    ast.BitAnd: ASTOperator.BITWISE_AND,
+    ast.BitOr: ASTOperator.BITWISE_OR,
+    ast.And: ASTOperator.LOGICAL_AND,
+    ast.Or: ASTOperator.LOGICAL_OR,
+    # Unary operators
+    ast.Invert: ASTOperator.BIT_INVERT,
+    ast.Not: ASTOperator.NOT,
+    # TODO: Missing USub, possibility other unary ops?
+}
+
+
+# Mapping between Python function names encode in an ast.Call node and the
+# corresponding libcudf C++ AST operators.
+_python_cudf_function_map = {
+    # TODO: Operators listed on
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval  # noqa: E501
+    # that we don't support yet:
+    # expm1, log1p, arctan2 and log10.
+    "isnull": ASTOperator.IS_NULL,
+    "isna": ASTOperator.IS_NULL,
+    "sin": ASTOperator.SIN,
+    "cos": ASTOperator.COS,
+    "tan": ASTOperator.TAN,
+    "arcsin": ASTOperator.ARCSIN,
+    "arccos": ASTOperator.ARCCOS,
+    "arctan": ASTOperator.ARCTAN,
+    "sinh": ASTOperator.SINH,
+    "cosh": ASTOperator.COSH,
+    "tanh": ASTOperator.TANH,
+    "arcsinh": ASTOperator.ARCSINH,
+    "arccosh": ASTOperator.ARCCOSH,
+    "arctanh": ASTOperator.ARCTANH,
+    "exp": ASTOperator.EXP,
+    "log": ASTOperator.LOG,
+    "sqrt": ASTOperator.SQRT,
+    "abs": ASTOperator.ABS,
+    "ceil": ASTOperator.CEIL,
+    "floor": ASTOperator.FLOOR,
+    # TODO: Operators supported by libcudf with no Python function analog.
+    # ast.rint: ASTOperator.RINT,
+    # ast.cbrt: ASTOperator.CBRT,
+}
+
+
+class ExpressionTransformer(ast.NodeVisitor):
+    """A NodeVisitor specialized for constructing a libcudf expression tree.
+
+    This visitor is designed to handle AST nodes that have libcudf equivalents.
+    It constructs column references from names and literals from constants,
+    then builds up operations. The resulting expression is returned by the
+    `visit` method
+
+    Parameters
+    ----------
+    column_mapping : dict[str, ColumnNameReference | ColumnReference]
+        Mapping from names to column references or column name references.
+        The former can be used for `compute_column` the latter in IO filters.
+    """
+
+    def __init__(self, dict column_mapping):
+        self.column_mapping = column_mapping
+
+    def generic_visit(self, node):
+        raise ValueError(
+            f"Not expecting expression to have node of type {node.__class__.__name__}"
+        )
+
+    def visit_Module(self, node):
+        try:
+            expr, = node.body
+        except ValueError:
+            raise ValueError(
+                f"Expecting exactly one expression, not {len(node.body)}"
+            )
+        return self.visit(expr)
+
+    def visit_Expr(self, node):
+        return self.visit(node.value)
+
+    def visit_Name(self, node):
+        try:
+            return self.column_mapping[node.id]
+        except KeyError:
+            raise ValueError(f"Unknown column name {node.id}")
+
+    def visit_Constant(self, node):
+        if not isinstance(node.value, (float, int, str, complex)):
+            raise ValueError(
+                f"Unsupported literal {repr(node.value)} of type "
+                "{type(node.value).__name__}"
+            )
+        return Literal(from_arrow(pa.scalar(node.value)))
+
+    def visit_UnaryOp(self, node):
+        operand = self.visit(node.operand)
+        if isinstance(node.op, ast.USub):
+            # TODO: Except for leaf nodes, we won't know the type of the
+            # operand, so there's no way to know whether this should be a float
+            # or an int. We should maybe see what Spark does, and this will
+            # probably require casting.
+            minus_one = Literal(from_arrow(pa.scalar(-1)))
+            return Operation(ASTOperator.MUL, minus_one, operand)
+        elif isinstance(node.op, ast.UAdd):
+            return operand
+        else:
+            op = _python_cudf_operator_map[type(node.op)]
+            return Operation(op, operand)
+
+    def visit_BinOp(self, node):
+        left = self.visit(node.left)
+        right = self.visit(node.right)
+        op = _python_cudf_operator_map[type(node.op)]
+        return Operation(op, left, right)
+
+    def visit_BoolOp(self, node):
+        return functools.reduce(
+            functools.partial(Operation, ASTOperator.LOGICAL_AND),
+            (
+                Operation(
+                    _python_cudf_operator_map[type(node.op)],
+                    self.visit(left),
+                    self.visit(right),
+                )
+                for left, right in zip(
+                    node.values[:-1], node.values[1:], strict=True
+                )
+            )
+        )
+
+    def visit_Compare(self, node):
+        operands = [node.left, *node.comparators]
+        return functools.reduce(
+            functools.partial(Operation, ASTOperator.LOGICAL_AND),
+            (
+                Operation(
+                    _python_cudf_operator_map[type(op)],
+                    self.visit(left),
+                    self.visit(right),
+                )
+                for op, left, right in zip(
+                    node.ops, operands[:-1], operands[1:], strict=True
+                )
+            )
+        )
+
+    def visit_Call(self, node):
+        try:
+            op = _python_cudf_function_map[node.func.id]
+        except KeyError:
+            raise ValueError(f"Unsupported function {node.func}.")
+        # Assuming only unary functions are supported, which is checked above.
+        if len(node.args) != 1 or node.keywords:
+            raise ValueError(
+                f"Function {node.func} only accepts one positional "
+                "argument."
+            )
+        return Operation(op, self.visit(node.args[0]))
+
+
+@functools.lru_cache(256)
+def to_expression(str expr, tuple column_names):
+    """
+    Create an expression for `pylibcudf.transform.compute_column`.
+
+    Parameters
+    ----------
+    expr : str
+        The expression to evaluate. In (restricted) Python syntax.
+    column_names : tuple[str]
+        Ordered tuple of names. When calling `compute_column` on the resulting
+        expression, the provided table must have columns in the same order
+        as given here.
+
+    Notes
+    -----
+    This function keeps a small cache of recently used expressions.
+
+    Returns
+    -------
+    Expression
+        Expression for the given expr and col_names
+    """
+    visitor = ExpressionTransformer(
+        {name: ColumnReference(i) for i, name in enumerate(column_names)}
+    )
+    return visitor.visit(ast.parse(expr))