Merge pull request #17379 from bdice/branch-25.02-merge-24.12
Forward-merge branch-24.12 to branch-25.02
raydouglass authored Nov 20, 2024
2 parents 6498fc3 + 698a716 commit 664fd87
Showing 83 changed files with 2,693 additions and 1,443 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/pr.yaml
@@ -10,6 +10,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  # Please keep pr-builder as the top job here
   pr-builder:
     needs:
       - changed-files
@@ -37,13 +38,23 @@ jobs:
       - unit-tests-cudf-pandas
       - pandas-tests
       - pandas-tests-diff
+      - telemetry-setup
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
+  telemetry-setup:
+    continue-on-error: true
+    runs-on: ubuntu-latest
+    env:
+      OTEL_SERVICE_NAME: 'pr-cudf'
+    steps:
+      - name: Telemetry setup
+        uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
   changed-files:
     secrets: inherit
+    needs: telemetry-setup
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:
       files_yaml: |
@@ -91,9 +102,11 @@ jobs:
           - '!notebooks/**'
   checks:
     secrets: inherit
+    needs: telemetry-setup
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:
       enable_check_generated_files: false
+      ignored_pr_jobs: "telemetry-summarize"
   conda-cpp-build:
     needs: checks
     secrets: inherit
@@ -259,6 +272,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
+    needs: telemetry-setup
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:
       arch: '["amd64"]'
@@ -298,3 +312,18 @@ jobs:
       node_type: cpu4
       build_type: pull-request
       run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
+
+  telemetry-summarize:
+    runs-on: ubuntu-latest
+    needs: pr-builder
+    if: always()
+    continue-on-error: true
+    steps:
+      - name: Load stashed telemetry env vars
+        uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
+        with:
+          load_service_name: true
+      - name: Telemetry summarize
+        uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
+        with:
+          cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
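
Note: the two shared-actions steps above implement a stash/load pattern. telemetry-setup persists the OpenTelemetry environment variables (such as OTEL_SERVICE_NAME) before the other jobs run, and telemetry-summarize retrieves them once pr-builder finishes. A minimal shell sketch of that idea, assuming the variables travel between jobs as a workflow artifact (file name and steps here are hypothetical, not the actions' actual internals):

    # Stash side (telemetry-setup): capture OTEL_* variables for later jobs.
    env | grep '^OTEL_' > otel-env.txt
    # ...upload otel-env.txt as a workflow artifact...

    # Load side (telemetry-summarize): download the artifact, then re-export.
    set -a                # auto-export every variable sourced below
    source otel-env.txt
    set +a

The checks job also passes ignored_pr_jobs: "telemetry-summarize", presumably so this non-gating summary job is not counted as a required PR job.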
6 changes: 0 additions & 6 deletions .github/workflows/test.yaml
@@ -121,8 +121,6 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -153,8 +151,6 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -164,8 +160,6 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
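
For reference, the deleted matrix_filter is a jq program: it keeps only amd64 entries, groups them by CUDA major version, and picks from each group the entry with the newest Python (then CUDA) version; with it removed, these nightly jobs fall back to the shared workflow's default matrix. A standalone run of the filter on a made-up matrix (values are illustrative only):

    echo '[
      {"ARCH":"amd64","CUDA_VER":"11.8","PY_VER":"3.10"},
      {"ARCH":"amd64","CUDA_VER":"12.5","PY_VER":"3.11"},
      {"ARCH":"amd64","CUDA_VER":"12.5","PY_VER":"3.12"},
      {"ARCH":"arm64","CUDA_VER":"12.5","PY_VER":"3.12"}
    ]' | jq -c 'map(select(.ARCH == "amd64"))
      | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0])
      | map(max_by([(.PY_VER|split(".")|map(tonumber)),
                    (.CUDA_VER|split(".")|map(tonumber))]))'
    # -> [{"ARCH":"amd64","CUDA_VER":"11.8","PY_VER":"3.10"},
    #     {"ARCH":"amd64","CUDA_VER":"12.5","PY_VER":"3.12"}]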
21 changes: 21 additions & 0 deletions ci/run_cudf_polars_polars_tests.sh
@@ -12,8 +12,29 @@ DESELECTED_TESTS=(
   "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode
   "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error
   "tests/docs/test_user_guide.py" # No dot binary in CI image
+  "tests/unit/test_polars_import.py::test_fork_safety" # test started to hang in polars-1.14
+  "tests/unit/operations/test_join.py::test_join_4_columns_with_validity" # fails on some systems, see https://github.com/pola-rs/polars/issues/19870
+  "tests/unit/io/test_csv.py::test_read_web_file" # fails in rockylinux8 due to SSL CA issues
 )
 
+if [[ $(arch) == "aarch64" ]]; then
+    # The binary used for TPC-H generation is compiled for x86_64, not aarch64.
+    DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh")
+    # The connectorx package is not available on arm
+    DESELECTED_TESTS+=("tests/unit/io/database/test_read.py::test_read_database")
+    # The necessary timezone information cannot be found in our CI image.
+    DESELECTED_TESTS+=("tests/unit/io/test_parquet.py::test_parametric_small_page_mask_filtering")
+    DESELECTED_TESTS+=("tests/unit/testing/test_assert_series_equal.py::test_assert_series_equal_parametric")
+    DESELECTED_TESTS+=("tests/unit/operations/test_join.py::test_join_4_columns_with_validity")
+else
+    # Ensure that we don't run dbgen when it uses newer symbols than supported by the glibc version in the CI image.
+    glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
+    latest_glibc_symbol_found=$(nm py-polars/tests/benchmark/data/pdsh/dbgen/dbgen | grep GLIBC | grep -o "[0-9]\.[0-9]\+" | sort --version-sort | tail -1 | cut -d "." -f 2)
+    if [[ ${glibc_minor_version} -lt ${latest_glibc_symbol_found} ]]; then
+        DESELECTED_TESTS+=("tests/benchmark/test_pdsh.py::test_pdsh")
+    fi
+fi
+
 DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}")
 python -m pytest \
     --import-mode=importlib \
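
The glibc gate in the else branch compares two minor versions: the runtime glibc of the CI image (parsed from ldd --version) and the newest GLIBC_x.y version symbol referenced by the prebuilt dbgen binary (parsed from nm). A self-contained re-run of that logic on canned strings (the sample values are hypothetical):

    #!/usr/bin/env bash
    # Stand-ins for the output of `ldd --version | head -1` and `nm dbgen | grep GLIBC`.
    ldd_first_line="ldd (GNU libc) 2.28"
    dbgen_symbols=$'U memcpy@GLIBC_2.14\nU qsort_r@GLIBC_2.34'

    glibc_minor_version=$(grep -o "[0-9]\.[0-9]\+" <<<"${ldd_first_line}" | tail -1 | cut -d '.' -f2)
    latest_glibc_symbol_found=$(grep -o "[0-9]\.[0-9]\+" <<<"${dbgen_symbols}" | sort --version-sort | tail -1 | cut -d '.' -f2)

    echo "CI glibc minor: ${glibc_minor_version}; dbgen needs: ${latest_glibc_symbol_found}"
    if [[ ${glibc_minor_version} -lt ${latest_glibc_symbol_found} ]]; then
      echo "-> would deselect tests/benchmark/test_pdsh.py::test_pdsh"  # 28 < 34 here
    fi

The final printf -- " --deselect %s" line then flattens the array into repeated "--deselect <test-id>" arguments for pytest.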
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -66,7 +66,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.14
+- polars>=1.11,<1.15
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<19.0.0a0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,7 +64,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.14
+- polars>=1.11,<1.15
 - pre-commit
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
2 changes: 1 addition & 1 deletion conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.11,<1.14
+    - polars >=1.11,<1.15
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -464,6 +464,7 @@ add_library(
   src/io/avro/avro_gpu.cu
   src/io/avro/reader_impl.cu
   src/io/comp/brotli_dict.cpp
+  src/io/comp/comp.cpp
   src/io/comp/cpu_unbz2.cpp
   src/io/comp/debrotli.cu
   src/io/comp/gpuinflate.cu
7 changes: 3 additions & 4 deletions cpp/benchmarks/CMakeLists.txt
@@ -354,10 +354,7 @@ ConfigureNVBench(
 
 # ##################################################################################################
 # * strings benchmark -------------------------------------------------------------------
-ConfigureBench(
-  STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/replace.cpp string/translate.cpp
-  string/url_decode.cu
-)
+ConfigureBench(STRINGS_BENCH string/factory.cu string/repeat_strings.cpp string/url_decode.cu)
 
 ConfigureNVBench(
   STRINGS_NVBENCH
@@ -381,11 +378,13 @@ ConfigureNVBench(
   string/lengths.cpp
   string/like.cpp
   string/make_strings_column.cu
+  string/replace.cpp
   string/replace_re.cpp
   string/reverse.cpp
   string/slice.cpp
   string/split.cpp
   string/split_re.cpp
+  string/translate.cpp
 )
 
 # ##################################################################################################
56 changes: 46 additions & 10 deletions cpp/benchmarks/io/json/json_reader_input.cpp
@@ -24,17 +24,19 @@
 
 #include <nvbench/nvbench.cuh>
 
-// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
-// run on most GPUs, but large enough to allow highest throughput
-constexpr size_t data_size = 512 << 20;
+// Default size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks
+// to run on most GPUs, but large enough to allow highest throughput
+constexpr size_t default_data_size = 512 << 20;
 constexpr cudf::size_type num_cols = 64;
 
 void json_read_common(cuio_source_sink_pair& source_sink,
                       cudf::size_type num_rows_to_read,
-                      nvbench::state& state)
+                      nvbench::state& state,
+                      cudf::io::compression_type comptype = cudf::io::compression_type::NONE,
+                      size_t data_size = default_data_size)
 {
   cudf::io::json_reader_options read_opts =
-    cudf::io::json_reader_options::builder(source_sink.make_source_info());
+    cudf::io::json_reader_options::builder(source_sink.make_source_info()).compression(comptype);
 
   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -57,15 +59,21 @@ void json_read_common(cuio_source_sink_pair& source_sink,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
-cudf::size_type json_write_bm_data(cudf::io::sink_info sink,
-                                   std::vector<cudf::type_id> const& dtypes)
+cudf::size_type json_write_bm_data(
+  cudf::io::sink_info sink,
+  std::vector<cudf::type_id> const& dtypes,
+  cudf::io::compression_type comptype = cudf::io::compression_type::NONE,
+  size_t data_size = default_data_size)
 {
   auto const tbl = create_random_table(
     cycle_dtypes(dtypes, num_cols), table_size_bytes{data_size}, data_profile_builder());
   auto const view = tbl->view();
 
   cudf::io::json_writer_options const write_opts =
-    cudf::io::json_writer_options::builder(sink, view).na_rep("null").rows_per_chunk(100'000);
+    cudf::io::json_writer_options::builder(sink, view)
+      .na_rep("null")
+      .rows_per_chunk(100'000)
+      .compression(comptype);
   cudf::io::write_json(write_opts);
   return view.num_rows();
 }
@@ -87,6 +95,26 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IO>>)
   json_read_common(source_sink, num_rows, state);
 }
 
+template <cudf::io::compression_type comptype, io_type IO>
+void BM_json_read_compressed_io(
+  nvbench::state& state, nvbench::type_list<nvbench::enum_type<comptype>, nvbench::enum_type<IO>>)
+{
+  size_t const data_size = state.get_int64("data_size");
+  cuio_source_sink_pair source_sink(IO);
+  auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                         static_cast<int32_t>(data_type::FLOAT),
+                                         static_cast<int32_t>(data_type::DECIMAL),
+                                         static_cast<int32_t>(data_type::TIMESTAMP),
+                                         static_cast<int32_t>(data_type::DURATION),
+                                         static_cast<int32_t>(data_type::STRING),
+                                         static_cast<int32_t>(data_type::LIST),
+                                         static_cast<int32_t>(data_type::STRUCT)});
+  auto const num_rows =
+    json_write_bm_data(source_sink.make_sink_info(), d_type, comptype, data_size);
+
+  json_read_common(source_sink, num_rows, state, comptype, data_size);
+}
+
 template <data_type DataType, io_type IO>
 void BM_json_read_data_type(
   nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)
@@ -110,8 +138,9 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
 using io_list =
   nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
 
-using compression_list =
-  nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
+using compression_list = nvbench::enum_type_list<cudf::io::compression_type::GZIP,
+                                                 cudf::io::compression_type::SNAPPY,
+                                                 cudf::io::compression_type::NONE>;
 
 NVBENCH_BENCH_TYPES(BM_json_read_data_type,
                     NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
@@ -123,3 +152,10 @@ NVBENCH_BENCH_TYPES(BM_json_read_io, NVBENCH_TYPE_AXES(io_list))
   .set_name("json_read_io")
   .set_type_axes_names({"io"})
   .set_min_samples(4);
+
+NVBENCH_BENCH_TYPES(BM_json_read_compressed_io,
+                    NVBENCH_TYPE_AXES(compression_list, nvbench::enum_type_list<io_type::FILEPATH>))
+  .set_name("json_read_compressed_io")
+  .set_type_axes_names({"compression_type", "io"})
+  .add_int64_power_of_two_axis("data_size", nvbench::range(20, 29, 1))
+  .set_min_samples(4);
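
For scale, default_data_size (512 << 20) works out to 512 MiB, and the new power-of-two data_size axis sweeps exponents 20 through 29, so the compressed-read benchmark covers inputs from 1 MiB up to 512 MiB for each compression type. A quick arithmetic check in shell:

    echo $(( 512 << 20 ))               # 536870912 bytes = 512 MiB (default_data_size)
    echo $(( 1 << 20 )) $(( 1 << 29 ))  # 1048576 536870912: endpoints of the data_size sweep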
1 change: 1 addition & 0 deletions cpp/benchmarks/io/nvbench_helpers.hpp
@@ -76,6 +76,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
   [](auto value) {
     switch (value) {
       case cudf::io::compression_type::SNAPPY: return "SNAPPY";
+      case cudf::io::compression_type::GZIP: return "GZIP";
       case cudf::io::compression_type::NONE: return "NONE";
       default: return "Unknown";
     }
(Diff truncated: the remaining changed files are not shown.)
