From b36bca2c00e3cbe0fcbb5c4064e325ebf12d58ef Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Sep 2023 11:32:46 -0700
Subject: [PATCH] Squash

---
 .flake8 | 24 -
 .github/copy-pr-bot.yaml | 4 +
 .github/ops-bot.yaml | 1 -
 .github/workflows/build.yaml | 33 +-
 .github/workflows/pr.yaml | 47 +-
 .github/workflows/test.yaml | 25 +-
 .gitignore | 1 +
 .pre-commit-config.yaml | 19 +-
 CHANGELOG.md | 428 ++++
 CONTRIBUTING.md | 2 +-
 README.md | 4 +-
 build.sh | 34 +-
 ci/build_cpp.sh | 4 +-
 ci/build_docs.sh | 36 +-
 ci/build_python.sh | 9 +-
 ci/build_wheel.sh | 55 +
 ci/build_wheel_cudf.sh | 16 +
 ci/build_wheel_dask_cudf.sh | 11 +
 ci/check_style.sh | 2 +-
 ci/checks/copyright.py | 1 -
 ci/checks/doxygen.sh | 8 +-
 ci/docs/build.sh | 61 -
 ci/release/apply_wheel_modifications.sh | 32 -
 ci/release/update-version.sh | 43 +-
 ci/test_cpp.sh | 44 +-
 ci/test_wheel_cudf.sh | 17 +
 ci/test_wheel_dask_cudf.sh | 19 +
 .../all_cuda-118_arch-x86_64.yaml | 47 +-
 .../all_cuda-120_arch-x86_64.yaml | 98 +
 conda/recipes/cudf/conda_build_config.yaml | 5 +-
 conda/recipes/cudf/meta.yaml | 60 +-
 .../cudf_kafka/conda_build_config.yaml | 3 +
 conda/recipes/cudf_kafka/meta.yaml | 11 +-
 conda/recipes/custreamz/meta.yaml | 15 +-
 conda/recipes/dask-cudf/meta.yaml | 18 +-
 conda/recipes/dask-cudf/run_test.sh | 8 +-
 conda/recipes/libcudf/build.sh | 8 +-
 conda/recipes/libcudf/conda_build_config.yaml | 41 +-
 conda/recipes/libcudf/meta.yaml | 104 +-
 conda/recipes/libcudf/nvcomp.txt | 3 -
 conda/recipes/libcudf/post-link.sh | 6 -
 cpp/CMakeLists.txt | 85 +-
 cpp/benchmarks/CMakeLists.txt | 40 +-
 cpp/benchmarks/binaryop/compiled_binaryop.cpp | 4 +
 cpp/benchmarks/common/generate_input.cu | 47 +-
 cpp/benchmarks/common/generate_input.hpp | 15 +
 cpp/benchmarks/copying/contiguous_split.cu | 94 +-
 cpp/benchmarks/copying/copy_if_else.cpp | 8 +
 cpp/benchmarks/copying/gather.cu | 4 +-
 cpp/benchmarks/fixture/benchmark_fixture.hpp | 12 +-
 cpp/benchmarks/fixture/nvbench_fixture.hpp | 94 +
 cpp/benchmarks/fixture/nvbench_main.cpp | 21 +-
 cpp/benchmarks/fixture/rmm_pool_raii.hpp | 81 -
 .../fixture/templated_benchmark_fixture.hpp | 4 +-
 cpp/benchmarks/groupby/group_max.cpp | 3 +-
 cpp/benchmarks/groupby/group_no_requests.cpp | 4 +-
 cpp/benchmarks/groupby/group_nth.cpp | 2 +-
 cpp/benchmarks/groupby/group_nunique.cpp | 3 +-
 cpp/benchmarks/groupby/group_rank.cpp | 1 -
 cpp/benchmarks/groupby/group_scan.cpp | 4 +-
 cpp/benchmarks/groupby/group_shift.cpp | 4 +-
 cpp/benchmarks/groupby/group_struct_keys.cpp | 9 +-
 cpp/benchmarks/groupby/group_sum.cpp | 4 +-
 cpp/benchmarks/hashing/hash.cpp | 84 +-
 cpp/benchmarks/hashing/partition.cpp | 7 +
 cpp/benchmarks/io/csv/csv_reader_input.cpp | 1 -
 cpp/benchmarks/io/csv/csv_reader_options.cpp | 1 -
 cpp/benchmarks/io/csv/csv_writer.cpp | 2 +-
 cpp/benchmarks/io/fst.cu | 55 +-
 cpp/benchmarks/io/json/json_reader_input.cpp | 48 +-
 cpp/benchmarks/io/json/json_writer.cpp | 2 +-
 cpp/benchmarks/io/json/nested_json.cpp | 11 +-
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 3 +-
 cpp/benchmarks/io/orc/orc_reader_options.cpp | 3 +-
 cpp/benchmarks/io/orc/orc_writer.cpp | 3 +-
 cpp/benchmarks/io/orc/orc_writer_chunks.cpp | 3 +-
 .../io/parquet/parquet_reader_input.cpp | 48 +-
 .../io/parquet/parquet_reader_options.cpp | 3 +-
 cpp/benchmarks/io/parquet/parquet_writer.cpp | 3 +-
 .../io/parquet/parquet_writer_chunks.cpp | 3 +-
 cpp/benchmarks/io/text/multibyte_split.cpp | 1 -
 cpp/benchmarks/iterator/iterator.cu | 4 +-
 cpp/benchmarks/join/generate_input_tables.cuh | 34 +-
 cpp/benchmarks/join/join.cu | 1 -
cpp/benchmarks/join/join_common.hpp | 4 +- cpp/benchmarks/join/mixed_join.cu | 1 - cpp/benchmarks/lists/copying/scatter_lists.cu | 24 +- cpp/benchmarks/lists/set_operations.cpp | 1 - cpp/benchmarks/null_mask/set_null_mask.cpp | 2 +- cpp/benchmarks/quantiles/quantiles.cpp | 6 +- cpp/benchmarks/reduction/anyall.cpp | 2 +- cpp/benchmarks/reduction/dictionary.cpp | 2 +- cpp/benchmarks/reduction/minmax.cpp | 2 +- cpp/benchmarks/reduction/rank.cpp | 1 - cpp/benchmarks/reduction/reduce.cpp | 2 +- cpp/benchmarks/reduction/scan_structs.cpp | 1 - cpp/benchmarks/reduction/segmented_reduce.cpp | 4 +- cpp/benchmarks/search/contains.cpp | 1 - cpp/benchmarks/sort/nested_types_common.hpp | 11 +- cpp/benchmarks/sort/rank.cpp | 2 +- cpp/benchmarks/sort/rank_structs.cpp | 2 +- cpp/benchmarks/sort/segmented_sort.cpp | 1 - cpp/benchmarks/sort/sort.cpp | 4 +- cpp/benchmarks/sort/sort_lists.cpp | 2 +- .../stream_compaction/apply_boolean_mask.cpp | 6 +- cpp/benchmarks/stream_compaction/distinct.cpp | 1 - .../stream_compaction/stable_distinct.cpp | 96 + cpp/benchmarks/stream_compaction/unique.cpp | 1 - .../stream_compaction/unique_count.cpp | 1 - cpp/benchmarks/string/case.cpp | 7 +- cpp/benchmarks/string/char_types.cpp | 66 + cpp/benchmarks/string/contains.cpp | 103 +- cpp/benchmarks/string/convert_durations.cpp | 4 +- cpp/benchmarks/string/convert_fixed_point.cpp | 18 +- cpp/benchmarks/string/count.cpp | 62 + cpp/benchmarks/string/extract.cpp | 71 +- cpp/benchmarks/string/gather.cpp | 59 + cpp/benchmarks/string/join_strings.cpp | 58 + cpp/benchmarks/string/json.cu | 4 +- cpp/benchmarks/string/lengths.cpp | 7 +- cpp/benchmarks/string/like.cpp | 60 +- cpp/benchmarks/string/replace_re.cpp | 84 +- cpp/benchmarks/string/reverse.cpp | 7 +- cpp/benchmarks/string/slice.cpp | 10 +- cpp/benchmarks/string/split.cpp | 86 +- cpp/benchmarks/string/split_re.cpp | 60 + .../synchronization/synchronization.cpp | 4 +- cpp/benchmarks/text/edit_distance.cpp | 58 + cpp/benchmarks/text/hash_ngrams.cpp | 60 + cpp/benchmarks/text/jaccard.cpp | 62 + cpp/benchmarks/text/minhash.cpp | 18 +- cpp/benchmarks/text/normalize.cpp | 69 +- cpp/benchmarks/text/normalize_spaces.cpp | 66 - cpp/benchmarks/text/replace.cpp | 59 +- cpp/benchmarks/text/subword.cpp | 9 +- cpp/benchmarks/text/tokenize.cpp | 108 +- .../type_dispatcher/type_dispatcher.cu | 12 +- cpp/cmake/thirdparty/get_arrow.cmake | 36 +- cpp/cmake/thirdparty/get_cufile.cmake | 4 +- cpp/cmake/thirdparty/get_libcudacxx.cmake | 37 + cpp/cmake/thirdparty/get_thrust.cmake | 6 +- .../patches/nvbench_global_setup.diff | 12 +- .../thirdparty/patches/nvbench_override.json | 5 - cpp/doxygen/Doxyfile | 168 +- .../developer_guide/DEVELOPER_GUIDE.md | 25 +- cpp/doxygen/developer_guide/TESTING.md | 66 + cpp/examples/basic/CMakeLists.txt | 4 +- cpp/examples/strings/CMakeLists.txt | 4 +- cpp/examples/strings/common.hpp | 4 +- cpp/examples/strings/custom_prealloc.cu | 2 +- .../cudf/ast/detail/expression_parser.hpp | 27 +- .../ast/detail/expression_transformer.hpp | 64 + cpp/include/cudf/ast/detail/operators.hpp | 47 + cpp/include/cudf/ast/expressions.hpp | 88 +- cpp/include/cudf/column/column.hpp | 87 +- .../cudf/column/column_device_view.cuh | 6 +- cpp/include/cudf/column/column_factories.hpp | 4 +- cpp/include/cudf/column/column_view.hpp | 62 +- cpp/include/cudf/concatenate.hpp | 42 +- cpp/include/cudf/contiguous_split.hpp | 151 +- cpp/include/cudf/copying.hpp | 168 +- cpp/include/cudf/datetime.hpp | 4 +- .../cudf/detail/aggregation/result_cache.hpp | 6 +- ...{concatenate.cuh => concatenate_masks.hpp} | 
27 +- cpp/include/cudf/detail/copy_if.cuh | 26 +- cpp/include/cudf/detail/copy_if_else.cuh | 8 +- cpp/include/cudf/detail/copy_range.cuh | 27 +- cpp/include/cudf/detail/indexalator.cuh | 6 +- cpp/include/cudf/detail/join.hpp | 4 +- cpp/include/cudf/detail/null_mask.cuh | 29 +- cpp/include/cudf/detail/null_mask.hpp | 15 +- cpp/include/cudf/detail/nvtx/nvtx3.hpp | 6 +- cpp/include/cudf/detail/scatter.hpp | 4 +- .../cudf/detail/sizes_to_offsets_iterator.cuh | 8 +- cpp/include/cudf/detail/stream_compaction.hpp | 19 +- cpp/include/cudf/detail/tdigest/tdigest.hpp | 95 +- cpp/include/cudf/detail/transform.hpp | 4 +- cpp/include/cudf/detail/utilities/cuda.cuh | 60 +- .../cudf/detail/utilities/device_atomics.cuh | 124 +- .../detail/utilities/device_operators.cuh | 49 +- .../cudf/detail/utilities/hash_functions.cuh | 381 ---- .../cudf/detail/utilities/int_fastdiv.h | 36 +- .../cudf/detail/utilities/integer_utils.hpp | 8 +- .../detail/utilities/pinned_host_vector.hpp | 6 +- .../cudf/detail/utilities/stacktrace.hpp | 47 + .../detail/utilities/vector_factories.hpp | 4 +- cpp/include/cudf/detail/valid_if.cuh | 6 +- cpp/include/cudf/filling.hpp | 25 +- cpp/include/cudf/fixed_point/fixed_point.hpp | 5 +- cpp/include/cudf/fixed_point/temporary.hpp | 8 +- cpp/include/cudf/groupby.hpp | 11 +- cpp/include/cudf/hashing.hpp | 112 +- .../cudf/hashing/detail/default_hash.cuh | 35 + .../cudf/hashing/detail/hash_functions.cuh | 71 + .../cudf/{ => hashing}/detail/hashing.hpp | 47 +- .../hashing/detail/murmurhash3_x64_128.cuh | 223 +++ .../hashing/detail/murmurhash3_x86_32.cuh | 194 ++ cpp/include/cudf/io/arrow_io_source.hpp | 85 + cpp/include/cudf/io/csv.hpp | 30 +- cpp/include/cudf/io/data_sink.hpp | 10 +- cpp/include/cudf/io/datasource.hpp | 149 +- cpp/include/cudf/io/detail/data_casting.cuh | 6 +- cpp/include/cudf/io/detail/json.hpp | 2 +- cpp/include/cudf/io/detail/orc.hpp | 4 +- cpp/include/cudf/io/detail/parquet.hpp | 12 +- cpp/include/cudf/io/detail/tokenize_json.hpp | 2 + cpp/include/cudf/io/json.hpp | 41 +- cpp/include/cudf/io/orc.hpp | 90 +- cpp/include/cudf/io/orc_metadata.hpp | 4 +- cpp/include/cudf/io/parquet.hpp | 172 +- cpp/include/cudf/io/parquet_metadata.hpp | 231 +++ .../io/text/data_chunk_source_factories.hpp | 4 +- cpp/include/cudf/io/types.hpp | 142 +- cpp/include/cudf/join.hpp | 37 +- cpp/include/cudf/lists/combine.hpp | 4 +- cpp/include/cudf/lists/detail/gather.cuh | 27 +- cpp/include/cudf/lists/detail/scatter.cuh | 8 +- cpp/include/cudf/lists/lists_column_view.hpp | 8 +- cpp/include/cudf/null_mask.hpp | 18 + cpp/include/cudf/reduction.hpp | 2 +- .../cudf/reduction/detail/reduction.hpp | 40 + .../reduction/detail/reduction_operators.cuh | 26 +- cpp/include/cudf/replace.hpp | 25 +- cpp/include/cudf/rolling.hpp | 28 +- .../cudf/rolling/range_window_bounds.hpp | 4 +- cpp/include/cudf/scalar/scalar.hpp | 4 +- .../cudf/scalar/scalar_device_view.cuh | 6 +- cpp/include/cudf/search.hpp | 11 +- cpp/include/cudf/stream_compaction.hpp | 49 +- cpp/include/cudf/strings/combine.hpp | 8 +- .../cudf/strings/convert/convert_datetime.hpp | 2 +- .../cudf/strings/convert/convert_integers.hpp | 4 +- .../cudf/strings/convert/convert_lists.hpp | 4 +- .../cudf/strings/detail/char_tables.hpp | 6 +- .../strings/detail/convert/fixed_point.cuh | 4 +- .../cudf/strings/detail/convert/is_float.cuh | 4 +- .../detail/convert/string_to_float.cuh | 6 +- .../strings/detail/convert/string_to_int.cuh | 4 +- cpp/include/cudf/strings/detail/gather.cuh | 70 +- .../cudf/strings/detail/split_utils.cuh | 62 +- 
.../cudf/strings/detail/strings_children.cuh | 4 +- .../detail/strings_column_factories.cuh | 2 +- cpp/include/cudf/strings/detail/strip.cuh | 4 +- cpp/include/cudf/strings/detail/utf8.hpp | 2 +- cpp/include/cudf/strings/detail/utilities.cuh | 24 +- cpp/include/cudf/strings/find.hpp | 29 +- cpp/include/cudf/strings/repeat_strings.hpp | 7 +- cpp/include/cudf/strings/slice.hpp | 93 - cpp/include/cudf/strings/split/split.hpp | 56 +- cpp/include/cudf/strings/string_view.cuh | 57 +- cpp/include/cudf/strings/string_view.hpp | 57 +- .../cudf/strings/strings_column_view.hpp | 6 +- .../cudf/table/experimental/row_operators.cuh | 34 +- cpp/include/cudf/table/row_operators.cuh | 10 +- .../cudf/tdigest/tdigest_column_view.hpp | 8 +- cpp/include/cudf/types.hpp | 11 +- cpp/include/cudf/utilities/error.hpp | 32 +- cpp/include/cudf/utilities/span.hpp | 4 +- cpp/include/cudf/wrappers/dictionary.hpp | 2 +- cpp/include/cudf_test/base_fixture.hpp | 29 +- cpp/include/cudf_test/column_utilities.hpp | 4 +- cpp/include/cudf_test/column_wrapper.hpp | 47 +- cpp/include/cudf_test/cudf_gtest.hpp | 2 +- cpp/include/cudf_test/cxxopts.hpp | 262 +-- cpp/include/cudf_test/file_utilities.hpp | 6 +- .../stream_checking_resource_adaptor.hpp | 8 + cpp/include/cudf_test/tdigest_utilities.cuh | 4 +- cpp/include/doxygen_groups.h | 5 +- cpp/include/nvtext/bpe_tokenize.hpp | 15 +- cpp/include/nvtext/detail/generate_ngrams.hpp | 37 + cpp/include/nvtext/generate_ngrams.hpp | 35 +- cpp/include/nvtext/jaccard.hpp | 79 + cpp/include/nvtext/minhash.hpp | 91 +- cpp/include/nvtext/subword_tokenize.hpp | 9 +- cpp/libcudf_kafka/CMakeLists.txt | 4 +- .../include/cudf_kafka/kafka_callback.hpp | 4 +- cpp/src/ast/expression_parser.cpp | 12 +- cpp/src/ast/expressions.cpp | 27 +- cpp/src/binaryop/binaryop.cpp | 5 +- cpp/src/binaryop/compiled/binary_ops.cu | 11 +- cpp/src/binaryop/compiled/binary_ops.cuh | 4 +- cpp/src/bitmask/null_mask.cu | 51 +- cpp/src/column/column.cu | 27 +- cpp/src/column/column_factories.cpp | 56 +- cpp/src/column/column_factories.cu | 2 +- cpp/src/column/column_view.cpp | 18 +- cpp/src/copying/concatenate.cu | 110 +- cpp/src/copying/contiguous_split.cu | 1527 +++++++++++---- cpp/src/copying/copy.cpp | 10 +- cpp/src/copying/copy.cu | 18 +- cpp/src/copying/copy_range.cu | 12 +- cpp/src/copying/gather.cu | 13 +- cpp/src/copying/get_element.cu | 3 +- cpp/src/copying/purge_nonempty_nulls.cu | 14 +- cpp/src/copying/reverse.cu | 14 +- cpp/src/copying/sample.cu | 5 +- cpp/src/copying/scatter.cu | 55 +- cpp/src/copying/shift.cu | 3 +- cpp/src/copying/slice.cu | 28 +- cpp/src/copying/split.cpp | 24 +- cpp/src/datetime/datetime_ops.cu | 4 +- cpp/src/datetime/timezone.cpp | 8 +- cpp/src/dictionary/add_keys.cu | 2 +- cpp/src/dictionary/detail/concatenate.cu | 10 +- cpp/src/dictionary/dictionary_factories.cu | 5 +- cpp/src/dictionary/remove_keys.cu | 5 +- cpp/src/filling/calendrical_month_sequence.cu | 5 +- cpp/src/filling/fill.cu | 9 +- cpp/src/filling/repeat.cu | 24 +- cpp/src/filling/sequence.cu | 6 +- cpp/src/groupby/groupby.cu | 17 +- cpp/src/groupby/hash/groupby.cu | 34 +- cpp/src/groupby/hash/multi_pass_kernels.cuh | 8 +- cpp/src/groupby/sort/aggregate.cpp | 10 +- cpp/src/groupby/sort/group_collect.cu | 4 +- cpp/src/groupby/sort/group_merge_lists.cu | 6 +- cpp/src/groupby/sort/group_scan_util.cuh | 3 +- .../sort/group_single_pass_reduction_util.cuh | 20 +- cpp/src/groupby/sort/group_std.cu | 38 +- cpp/src/groupby/sort/scan.cpp | 2 +- cpp/src/groupby/sort/sort_helper.cu | 79 +- cpp/src/hash/concurrent_unordered_map.cuh | 72 +- 
cpp/src/hash/hash_allocator.cuh | 124 +- cpp/src/hash/hashing.cu | 45 +- cpp/src/hash/helper_functions.cuh | 46 +- cpp/src/hash/managed.cuh | 88 +- cpp/src/hash/md5_hash.cu | 118 +- cpp/src/hash/murmurhash3_x64_128.cu | 150 ++ .../{murmur_hash.cu => murmurhash3_x86_32.cu} | 29 +- ...ur_hash.cu => spark_murmurhash3_x86_32.cu} | 87 +- cpp/src/hash/unordered_multiset.cuh | 64 +- cpp/src/hash/xxhash_64.cu | 337 ++++ cpp/src/interop/detail/arrow_allocator.cpp | 45 +- cpp/src/interop/detail/arrow_allocator.hpp | 6 +- cpp/src/interop/dlpack.cpp | 12 +- cpp/src/interop/from_arrow.cu | 29 +- cpp/src/interop/to_arrow.cu | 4 +- cpp/src/io/avro/avro.cpp | 17 +- cpp/src/io/avro/avro.hpp | 16 +- cpp/src/io/avro/avro_gpu.cu | 16 +- cpp/src/io/avro/reader_impl.cu | 39 +- cpp/src/io/comp/brotli_dict.cpp | 4 +- cpp/src/io/comp/brotli_dict.hpp | 4 +- cpp/src/io/comp/cpu_unbz2.cpp | 24 +- cpp/src/io/comp/debrotli.cu | 72 +- cpp/src/io/comp/gpuinflate.cu | 58 +- cpp/src/io/comp/gpuinflate.hpp | 30 +- cpp/src/io/comp/nvcomp_adapter.cpp | 4 +- cpp/src/io/comp/snap.cu | 18 +- cpp/src/io/comp/statistics.cu | 62 + cpp/src/io/comp/unbz2.hpp | 4 +- cpp/src/io/comp/uncomp.cpp | 65 +- cpp/src/io/comp/unsnap.cu | 14 +- cpp/src/io/csv/csv_gpu.cu | 47 +- cpp/src/io/csv/csv_gpu.hpp | 6 +- cpp/src/io/csv/datetime.cuh | 10 +- cpp/src/io/csv/durations.cu | 8 +- cpp/src/io/csv/reader_impl.cu | 58 +- cpp/src/io/csv/writer_impl.cu | 10 +- cpp/src/io/fst/agent_dfa.cuh | 58 +- cpp/src/io/fst/in_reg_array.cuh | 6 +- cpp/src/io/fst/logical_stack.cuh | 53 +- cpp/src/io/fst/lookup_tables.cuh | 591 ++++-- cpp/src/io/functions.cpp | 54 +- .../{experimental => }/byte_range_info.cu | 6 +- cpp/src/io/json/json_column.cu | 97 +- cpp/src/io/json/json_tree.cu | 113 +- cpp/src/io/json/{ => legacy}/json_gpu.cu | 49 +- cpp/src/io/json/{ => legacy}/json_gpu.hpp | 12 +- cpp/src/io/json/legacy/read_json.hpp | 33 + cpp/src/io/json/{ => legacy}/reader_impl.cu | 54 +- cpp/src/io/json/nested_json.hpp | 37 +- cpp/src/io/json/nested_json_gpu.cu | 1157 +++++++---- .../read_json.cpp => read_json.cu} | 50 +- .../io/json/{experimental => }/read_json.hpp | 6 +- cpp/src/io/json/write_json.cu | 117 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 26 +- cpp/src/io/orc/aggregate_orc_metadata.hpp | 13 +- cpp/src/io/orc/dict_enc.cu | 630 ++---- cpp/src/io/orc/orc.cpp | 20 +- cpp/src/io/orc/orc.hpp | 10 - cpp/src/io/orc/orc_field_reader.hpp | 8 +- cpp/src/io/orc/orc_field_writer.hpp | 18 +- cpp/src/io/orc/orc_gpu.hpp | 185 +- cpp/src/io/orc/reader_impl.cu | 1497 +++++++------- cpp/src/io/orc/reader_impl.hpp | 182 +- cpp/src/io/orc/stats_enc.cu | 34 +- cpp/src/io/orc/stripe_data.cu | 24 +- cpp/src/io/orc/stripe_enc.cu | 86 +- cpp/src/io/orc/stripe_init.cu | 18 +- cpp/src/io/orc/writer_impl.cu | 642 +++--- cpp/src/io/orc/writer_impl.hpp | 26 +- cpp/src/io/parquet/chunk_dict.cu | 16 +- .../io/parquet/compact_protocol_reader.cpp | 23 +- .../io/parquet/compact_protocol_reader.hpp | 19 +- .../io/parquet/compact_protocol_writer.cpp | 80 +- .../io/parquet/compact_protocol_writer.hpp | 49 +- cpp/src/io/parquet/decode_preprocess.cu | 417 ++++ cpp/src/io/parquet/delta_binary.cuh | 294 +++ cpp/src/io/parquet/page_data.cu | 1725 ++--------------- cpp/src/io/parquet/page_decode.cuh | 1363 +++++++++++++ cpp/src/io/parquet/page_delta_decode.cu | 176 ++ cpp/src/io/parquet/page_enc.cu | 365 ++-- cpp/src/io/parquet/page_hdr.cu | 64 +- cpp/src/io/parquet/page_string_decode.cu | 797 ++++++++ cpp/src/io/parquet/page_string_utils.cuh | 110 ++ cpp/src/io/parquet/parquet.hpp | 19 +- 
cpp/src/io/parquet/parquet_common.hpp | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 201 +- cpp/src/io/parquet/predicate_pushdown.cpp | 530 +++++ cpp/src/io/parquet/reader.cpp | 3 +- cpp/src/io/parquet/reader_impl.cpp | 284 ++- cpp/src/io/parquet/reader_impl.hpp | 49 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 62 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 105 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 302 ++- cpp/src/io/parquet/rle_stream.cuh | 365 ++++ cpp/src/io/parquet/writer_impl.cu | 228 ++- cpp/src/io/parquet/writer_impl.hpp | 11 +- cpp/src/io/statistics/byte_array_view.cuh | 10 +- cpp/src/io/statistics/column_statistics.cuh | 36 +- .../io/statistics/orc_column_statistics.cu | 8 +- .../statistics/parquet_column_statistics.cu | 8 +- cpp/src/io/statistics/statistics.cuh | 8 +- .../statistics_type_identification.cuh | 6 +- .../io/statistics/typed_statistics_chunk.cuh | 12 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 18 +- cpp/src/io/text/bgzip_utils.cpp | 4 +- .../io/text/data_chunk_source_factories.cpp | 10 +- cpp/src/io/text/multibyte_split.cu | 49 +- cpp/src/io/utilities/arrow_io_source.cpp | 85 + cpp/src/io/utilities/block_utils.cuh | 16 +- cpp/src/io/utilities/column_buffer.cpp | 190 +- cpp/src/io/utilities/column_buffer.hpp | 184 +- cpp/src/io/utilities/column_utils.cuh | 4 +- cpp/src/io/utilities/data_sink.cpp | 4 +- cpp/src/io/utilities/datasource.cpp | 87 +- cpp/src/io/utilities/file_io_utilities.cpp | 10 +- cpp/src/io/utilities/hostdevice_span.hpp | 22 +- cpp/src/io/utilities/hostdevice_vector.hpp | 41 +- cpp/src/io/utilities/output_builder.cuh | 18 +- cpp/src/io/utilities/parsing_utils.cu | 41 +- cpp/src/io/utilities/row_selection.cpp | 15 +- cpp/src/io/utilities/row_selection.hpp | 9 +- cpp/src/io/utilities/thread_pool.hpp | 20 +- cpp/src/io/utilities/trie.cu | 12 +- cpp/src/io/utilities/trie.cuh | 2 +- cpp/src/jit/cache.cpp | 4 +- cpp/src/jit/parser.cpp | 40 +- cpp/src/jit/parser.hpp | 42 +- cpp/src/jit/util.cpp | 8 +- cpp/src/jit/util.hpp | 4 +- cpp/src/join/conditional_join_kernels.cuh | 48 +- cpp/src/join/hash_join.cu | 13 +- cpp/src/join/join_common_utils.cuh | 20 +- cpp/src/join/join_common_utils.hpp | 11 +- cpp/src/join/mixed_join_common_utils.cuh | 3 +- cpp/src/join/mixed_join_size_kernel.cuh | 5 +- cpp/src/join/mixed_join_size_kernels_semi.cu | 5 +- cpp/src/labeling/label_bins.cu | 12 +- .../combine/concatenate_list_elements.cu | 53 +- cpp/src/lists/combine/concatenate_rows.cu | 19 +- cpp/src/lists/contains.cu | 337 +--- cpp/src/lists/copying/concatenate.cu | 16 +- cpp/src/lists/copying/copying.cu | 7 +- cpp/src/lists/copying/scatter_helper.cu | 14 +- cpp/src/lists/interleave_columns.cu | 36 +- cpp/src/lists/lists_column_factories.cu | 18 +- cpp/src/lists/reverse.cu | 2 +- cpp/src/lists/sequences.cu | 18 +- .../stream_compaction/apply_boolean_mask.cu | 10 +- cpp/src/lists/utilities.cu | 10 +- cpp/src/merge/merge.cu | 19 +- cpp/src/partitioning/partitioning.cu | 88 +- cpp/src/quantiles/tdigest/tdigest.cu | 18 +- .../quantiles/tdigest/tdigest_aggregation.cu | 46 +- cpp/src/reductions/all.cu | 16 +- cpp/src/reductions/any.cu | 16 +- cpp/src/reductions/minmax.cu | 6 +- .../reductions/nested_type_minmax_util.cuh | 182 ++ cpp/src/reductions/scan/scan.cuh | 12 +- cpp/src/reductions/scan/scan_exclusive.cu | 24 +- cpp/src/reductions/scan/scan_inclusive.cu | 56 +- cpp/src/reductions/segmented/simple.cuh | 2 +- cpp/src/reductions/simple.cuh | 14 +- cpp/src/reductions/struct_minmax_util.cuh | 155 -- cpp/src/replace/clamp.cu | 6 +- cpp/src/replace/nans.cu | 15 +- 
cpp/src/replace/nulls.cu | 27 +- cpp/src/replace/replace.cu | 103 +- cpp/src/reshape/interleave_columns.cu | 2 +- cpp/src/reshape/tile.cu | 6 +- cpp/src/rolling/detail/nth_element.cuh | 4 +- .../detail/optimized_unbounded_window.cpp | 161 ++ .../detail/optimized_unbounded_window.hpp | 56 + .../rolling/detail/range_comparator_utils.cuh | 143 ++ .../rolling/detail/range_window_bounds.hpp | 8 +- cpp/src/rolling/detail/rolling.cuh | 8 +- .../rolling/detail/rolling_collect_list.cu | 4 +- cpp/src/rolling/detail/rolling_jit.hpp | 10 +- cpp/src/rolling/grouped_rolling.cu | 132 +- cpp/src/rolling/jit/kernel.cu | 4 +- cpp/src/rolling/jit/operation.hpp | 6 +- cpp/src/rolling/range_window_bounds.cpp | 14 +- cpp/src/round/round.cu | 4 +- cpp/src/scalar/scalar.cpp | 6 +- cpp/src/search/contains_column.cu | 5 +- cpp/src/search/contains_scalar.cu | 4 +- cpp/src/search/contains_table.cu | 599 +++--- cpp/src/search/search_ordered.cu | 10 +- cpp/src/sort/segmented_sort_impl.cuh | 4 +- cpp/src/stream_compaction/distinct_count.cu | 48 +- cpp/src/stream_compaction/stable_distinct.cu | 37 +- .../stream_compaction_common.cuh | 2 +- .../stream_compaction_common.hpp | 1 - cpp/src/strings/attributes.cu | 10 +- cpp/src/strings/capitalize.cu | 12 +- cpp/src/strings/case.cu | 12 +- cpp/src/strings/char_types/char_cases.h | 4 +- cpp/src/strings/char_types/char_flags.h | 4 +- cpp/src/strings/char_types/char_types.cu | 93 +- cpp/src/strings/combine/concatenate.cu | 6 +- cpp/src/strings/combine/join.cu | 189 +- cpp/src/strings/combine/join_list_elements.cu | 6 +- cpp/src/strings/contains.cu | 7 +- cpp/src/strings/convert/convert_booleans.cu | 2 +- cpp/src/strings/convert/convert_datetime.cu | 4 +- cpp/src/strings/convert/convert_durations.cu | 22 +- .../strings/convert/convert_fixed_point.cu | 2 +- cpp/src/strings/convert/convert_floats.cu | 8 +- cpp/src/strings/convert/convert_hex.cu | 8 +- cpp/src/strings/convert/convert_integers.cu | 2 +- cpp/src/strings/convert/convert_ipv4.cu | 4 +- cpp/src/strings/convert/convert_lists.cu | 4 +- cpp/src/strings/convert/convert_urls.cu | 87 +- cpp/src/strings/copying/concatenate.cu | 16 +- cpp/src/strings/copying/shift.cu | 88 +- cpp/src/strings/count_matches.cu | 12 +- cpp/src/strings/extract/extract.cu | 25 +- cpp/src/strings/extract/extract_all.cu | 46 +- cpp/src/strings/json/json_path.cu | 34 +- cpp/src/strings/like.cu | 27 +- cpp/src/strings/padding.cu | 4 +- cpp/src/strings/regex/regcomp.cpp | 18 +- cpp/src/strings/regex/regcomp.h | 6 +- cpp/src/strings/regex/regex.cuh | 79 +- cpp/src/strings/regex/regex.inl | 58 +- cpp/src/strings/regex/regex_program_impl.h | 3 +- cpp/src/strings/regex/utilities.cuh | 4 +- cpp/src/strings/repeat_strings.cu | 15 +- cpp/src/strings/replace/backref_re.cuh | 51 +- cpp/src/strings/replace/multi_re.cu | 84 +- cpp/src/strings/replace/replace.cu | 2 +- cpp/src/strings/replace/replace_re.cu | 66 +- cpp/src/strings/reverse.cu | 6 +- cpp/src/strings/search/find.cu | 180 +- cpp/src/strings/search/find_multiple.cu | 6 +- cpp/src/strings/search/findall.cu | 23 +- cpp/src/strings/slice.cu | 157 -- cpp/src/strings/split/partition.cu | 2 +- cpp/src/strings/split/split.cu | 33 +- cpp/src/strings/split/split.cuh | 7 +- cpp/src/strings/split/split_re.cu | 41 +- cpp/src/strings/split/split_record.cu | 96 +- cpp/src/strings/strings_column_factories.cu | 10 +- cpp/src/strings/strings_column_view.cpp | 4 +- cpp/src/strings/utilities.cu | 14 +- cpp/src/structs/copying/concatenate.cu | 15 +- cpp/src/structs/utilities.cpp | 12 +- cpp/src/table/row_operators.cu | 420 ++-- 
cpp/src/text/edit_distance.cu | 194 +- cpp/src/text/generate_ngrams.cu | 113 +- cpp/src/text/jaccard.cu | 307 +++ cpp/src/text/minhash.cu | 265 ++- cpp/src/text/ngrams_tokenize.cu | 2 +- cpp/src/text/normalize.cu | 43 +- cpp/src/text/replace.cu | 2 +- cpp/src/text/stemmer.cu | 26 +- cpp/src/text/subword/bpe_tokenizer.cu | 67 +- cpp/src/text/subword/bpe_tokenizer.cuh | 78 +- cpp/src/text/subword/data_normalizer.cu | 51 +- .../text/subword/detail/codepoint_metadata.ah | 4 +- .../text/subword/detail/data_normalizer.hpp | 10 +- .../text/subword/detail/tokenizer_utils.cuh | 9 +- .../subword/detail/wordpiece_tokenizer.hpp | 10 +- cpp/src/text/subword/load_hash_file.cu | 14 +- cpp/src/text/subword/load_merges_file.cu | 51 +- cpp/src/text/subword/subword_tokenize.cu | 69 +- cpp/src/text/subword/wordpiece_tokenizer.cu | 47 +- cpp/src/text/tokenize.cu | 57 +- cpp/src/text/utilities/tokenize_ops.cuh | 55 +- cpp/src/transform/compute_column.cu | 8 +- cpp/src/transform/jit/kernel.cu | 15 +- cpp/src/transform/one_hot_encode.cu | 8 +- cpp/src/transform/row_bit_count.cu | 12 +- cpp/src/transform/transform.cpp | 2 +- cpp/src/transpose/transpose.cu | 8 +- cpp/src/utilities/stacktrace.cpp | 88 + cpp/tests/CMakeLists.txt | 63 +- cpp/tests/ast/transform_tests.cpp | 28 + cpp/tests/binaryop/binop-generic-ptx-test.cpp | 6 +- cpp/tests/bitmask/bitmask_tests.cpp | 12 +- cpp/tests/column/column_test.cpp | 86 +- .../column/column_view_device_span_test.cpp | 6 +- cpp/tests/column/factories_test.cpp | 22 +- cpp/tests/copying/concatenate_tests.cpp | 70 +- .../copying/copy_if_else_nested_tests.cpp | 2 +- cpp/tests/copying/copy_tests.cpp | 46 +- cpp/tests/copying/gather_str_tests.cpp | 18 +- cpp/tests/copying/gather_struct_tests.cpp | 4 +- cpp/tests/copying/get_value_tests.cpp | 20 +- .../copying/purge_nonempty_nulls_tests.cpp | 45 +- .../copying/scatter_list_scalar_tests.cpp | 8 +- cpp/tests/copying/scatter_tests.cpp | 10 +- cpp/tests/copying/shift_tests.cpp | 38 +- cpp/tests/copying/split_tests.cpp | 965 ++++++--- cpp/tests/copying/utility_tests.cpp | 20 +- cpp/tests/datetime/datetime_ops_test.cpp | 9 +- .../device_atomics/device_atomics_test.cu | 91 - cpp/tests/dictionary/decode_test.cpp | 2 +- cpp/tests/dictionary/remove_keys_test.cpp | 2 +- cpp/tests/dictionary/set_keys_test.cpp | 2 +- cpp/tests/filling/repeat_tests.cpp | 34 + cpp/tests/groupby/collect_list_tests.cpp | 10 +- cpp/tests/groupby/groupby_test_util.cpp | 2 +- cpp/tests/groupby/groupby_test_util.hpp | 1 + cpp/tests/groupby/max_tests.cpp | 71 + cpp/tests/groupby/min_tests.cpp | 71 + cpp/tests/groupby/tdigest_tests.cu | 8 +- cpp/tests/groupby/var_tests.cpp | 30 + cpp/tests/hash_map/map_test.cu | 5 +- cpp/tests/hashing/hash_test.cpp | 1109 ----------- cpp/tests/hashing/md5_test.cpp | 290 +++ .../hashing/murmurhash3_x64_128_test.cpp | 113 ++ cpp/tests/hashing/murmurhash3_x86_32_test.cpp | 405 ++++ .../hashing/spark_murmurhash3_x86_32_test.cpp | 576 ++++++ cpp/tests/hashing/xxhash_64_test.cpp | 177 ++ cpp/tests/interop/arrow_utils.hpp | 4 +- cpp/tests/interop/dlpack_test.cpp | 7 +- cpp/tests/interop/from_arrow_test.cpp | 9 +- cpp/tests/interop/to_arrow_test.cpp | 10 +- cpp/tests/io/arrow_io_source_test.cpp | 20 +- cpp/tests/io/comp/decomp_test.cpp | 22 +- cpp/tests/io/csv_test.cpp | 201 +- cpp/tests/io/fst/fst_test.cu | 21 +- cpp/tests/io/fst/logical_stack_test.cu | 4 +- cpp/tests/io/json_chunked_reader.cpp | 4 +- cpp/tests/io/json_test.cpp | 139 +- cpp/tests/io/json_tree.cpp | 21 +- cpp/tests/io/json_type_cast_test.cu | 60 +- cpp/tests/io/json_writer.cpp | 115 
++ cpp/tests/io/nested_json_test.cpp | 302 ++- cpp/tests/io/orc_test.cpp | 218 ++- cpp/tests/io/parquet_test.cpp | 1647 ++++++++++++++-- cpp/tests/io/row_selection_test.cpp | 2 +- cpp/tests/io/text/data_chunk_source_test.cpp | 6 +- cpp/tests/iterator/iterator_tests.cuh | 4 +- .../optional_iterator_test_numeric.cu | 4 +- .../iterator/pair_iterator_test_numeric.cu | 6 +- .../sizes_to_offsets_iterator_test.cu | 4 +- .../iterator/value_iterator_test_transform.cu | 6 +- cpp/tests/join/conditional_join_tests.cu | 10 +- cpp/tests/join/join_tests.cpp | 19 +- cpp/tests/join/mixed_join_tests.cu | 10 +- cpp/tests/join/semi_anti_join_tests.cpp | 7 +- .../concatenate_list_elements_tests.cpp | 315 +++ cpp/tests/lists/contains_tests.cpp | 4 +- cpp/tests/lists/extract_tests.cpp | 6 +- .../partitioning/hash_partition_test.cpp | 2 +- .../quantiles/percentile_approx_test.cpp | 4 +- cpp/tests/reductions/reduction_tests.cpp | 198 +- cpp/tests/reductions/scan_tests.cpp | 1 + cpp/tests/reductions/tdigest_tests.cu | 6 +- cpp/tests/replace/replace_nulls_tests.cpp | 23 +- cpp/tests/replace/replace_tests.cpp | 38 +- .../reshape/interleave_columns_tests.cpp | 2 +- .../rolling/grouped_rolling_range_test.cpp | 384 +++- cpp/tests/rolling/grouped_rolling_test.cpp | 73 +- cpp/tests/rolling/range_comparator_test.cu | 147 ++ cpp/tests/rolling/rolling_test.cpp | 8 +- cpp/tests/scalar/scalar_device_view_test.cu | 2 +- cpp/tests/search/search_test.cpp | 140 +- cpp/tests/sort/sort_nested_types_tests.cpp | 20 + .../distinct_count_tests.cpp | 10 + .../stream_compaction/distinct_tests.cpp | 14 +- .../stable_distinct_tests.cpp | 1354 +++++++++++++ cpp/tests/streams/concatenate_test.cpp | 51 + cpp/tests/streams/copying_test.cpp | 339 ++++ cpp/tests/streams/filling_test.cpp | 76 + cpp/tests/streams/groupby_test.cpp | 67 + cpp/tests/streams/hash_test.cpp | 54 + cpp/tests/streams/replace_test.cpp | 109 ++ cpp/tests/streams/search_test.cpp | 69 + cpp/tests/strings/array_tests.cpp | 32 +- cpp/tests/strings/attrs_tests.cpp | 8 +- cpp/tests/strings/booleans_tests.cpp | 13 +- cpp/tests/strings/case_tests.cpp | 16 +- cpp/tests/strings/chars_types_tests.cpp | 8 +- .../strings/combine/concatenate_tests.cpp | 24 +- .../strings/combine/join_strings_tests.cpp | 19 +- cpp/tests/strings/concatenate_tests.cpp | 14 +- cpp/tests/strings/contains_tests.cpp | 16 +- cpp/tests/strings/datetime_tests.cpp | 12 +- cpp/tests/strings/durations_tests.cpp | 10 +- cpp/tests/strings/extract_tests.cpp | 24 +- cpp/tests/strings/factories_test.cu | 24 +- cpp/tests/strings/fill_tests.cpp | 11 +- cpp/tests/strings/find_multiple_tests.cpp | 16 +- cpp/tests/strings/find_tests.cpp | 85 +- cpp/tests/strings/findall_tests.cpp | 4 +- cpp/tests/strings/floats_tests.cpp | 10 +- cpp/tests/strings/integers_tests.cpp | 19 +- cpp/tests/strings/ipv4_tests.cpp | 9 +- cpp/tests/strings/pad_tests.cpp | 26 +- cpp/tests/strings/repeat_strings_tests.cpp | 2 +- cpp/tests/strings/replace_regex_tests.cpp | 37 +- cpp/tests/strings/replace_tests.cpp | 30 +- cpp/tests/strings/reverse_tests.cpp | 4 +- cpp/tests/strings/slice_tests.cpp | 272 +-- cpp/tests/strings/split_tests.cpp | 76 +- cpp/tests/strings/strip_tests.cpp | 26 +- cpp/tests/strings/translate_tests.cpp | 12 +- cpp/tests/strings/urls_tests.cpp | 26 +- cpp/tests/structs/structs_column_tests.cpp | 4 +- cpp/tests/structs/utilities_tests.cpp | 44 - .../table/experimental_row_operator_tests.cu | 191 +- .../table/row_operator_tests_utilities.cu | 214 ++ .../table/row_operator_tests_utilities.hpp | 45 + 
cpp/tests/text/edit_distance_tests.cpp | 6 +- cpp/tests/text/jaccard_tests.cpp | 80 + cpp/tests/text/minhash_tests.cpp | 71 +- cpp/tests/text/ngrams_tests.cpp | 41 +- cpp/tests/text/ngrams_tokenize_tests.cpp | 2 +- cpp/tests/text/normalize_tests.cpp | 6 +- cpp/tests/text/replace_tests.cpp | 4 +- cpp/tests/text/stemmer_tests.cpp | 6 +- cpp/tests/text/subword_tests.cpp | 142 +- cpp/tests/text/tokenize_tests.cpp | 6 +- .../transform/integration/assert_unary.h | 6 +- .../integration/unary_transform_test.cpp | 16 +- cpp/tests/transform/mask_to_bools_test.cpp | 2 +- cpp/tests/transform/row_bit_count_test.cu | 47 +- cpp/tests/transpose/transpose_test.cpp | 6 +- cpp/tests/types/traits_test.cpp | 2 +- cpp/tests/unary/cast_tests.cpp | 61 +- cpp/tests/utilities/column_utilities.cu | 117 +- cpp/tests/utilities/identify_stream_usage.cpp | 109 +- cpp/tests/utilities/tdigest_utilities.cu | 6 +- .../column_utilities_tests.cpp | 28 +- cpp/tests/utilities_tests/span_tests.cu | 42 +- dependencies.yaml | 189 +- docs/cudf/Makefile | 2 +- ...lass_without_autosummary.rst => class.rst} | 3 + .../autosummary/class_with_autosummary.rst | 33 - docs/cudf/source/api_docs/dataframe.rst | 13 +- .../cudf/source/api_docs/extension_dtypes.rst | 170 ++ .../source/api_docs/general_functions.rst | 14 +- docs/cudf/source/api_docs/index.rst | 1 + docs/cudf/source/api_docs/index_objects.rst | 32 +- docs/cudf/source/api_docs/io.rst | 4 +- docs/cudf/source/api_docs/list_handling.rst | 6 + docs/cudf/source/api_docs/options.rst | 19 +- docs/cudf/source/api_docs/series.rst | 22 +- docs/cudf/source/api_docs/string_handling.rst | 6 + docs/cudf/source/api_docs/struct_handling.rst | 6 + .../cudf/source/api_docs/subword_tokenize.rst | 2 +- docs/cudf/source/conf.py | 52 +- .../developer_guide/contributing_guide.md | 7 +- .../source/developer_guide/documentation.md | 29 + docs/cudf/source/developer_guide/index.md | 1 + .../source/developer_guide/library_design.md | 26 +- docs/cudf/source/developer_guide/pylibcudf.md | 155 ++ docs/cudf/source/user_guide/10min.ipynb | 74 +- .../user_guide/cudf.CategoricalDtype.rst | 19 - .../user_guide/cudf.Decimal128Dtype.rst | 20 - .../source/user_guide/cudf.Decimal32Dtype.rst | 20 - .../source/user_guide/cudf.Decimal64Dtype.rst | 20 - .../cudf/source/user_guide/cudf.ListDtype.rst | 19 - .../source/user_guide/cudf.StructDtype.rst | 18 - .../cudf/source/user_guide/cupy-interop.ipynb | 14 +- docs/cudf/source/user_guide/data-types.md | 41 +- docs/cudf/source/user_guide/groupby.md | 5 +- .../source/user_guide/guide-to-udfs.ipynb | 4 +- docs/cudf/source/user_guide/io/io.md | 19 +- .../source/user_guide/pandas-comparison.md | 66 +- docs/dask_cudf/Makefile | 3 +- docs/dask_cudf/source/conf.py | 4 +- fetch_rapids.cmake | 2 +- java/ci/Dockerfile.centos7 | 2 +- java/ci/README.md | 4 +- java/pom.xml | 2 +- .../main/java/ai/rapids/cudf/ChunkedPack.java | 103 + .../java/ai/rapids/cudf/ColumnVector.java | 53 +- .../main/java/ai/rapids/cudf/ColumnView.java | 187 +- .../ai/rapids/cudf/ColumnWriterOptions.java | 30 + .../java/ai/rapids/cudf/ContiguousTable.java | 27 +- .../java/ai/rapids/cudf/CudaException.java | 26 +- .../ai/rapids/cudf/CudaFatalException.java | 12 +- .../cudf/CudfColumnSizeOverflowException.java | 34 + .../java/ai/rapids/cudf/CudfException.java | 16 +- .../cudf/DefaultHostMemoryAllocator.java | 36 + .../java/ai/rapids/cudf/HostColumnVector.java | 47 +- .../ai/rapids/cudf/HostMemoryAllocator.java | 39 + .../ai/rapids/cudf/HostMemoryReservation.java | 32 + .../ai/rapids/cudf/JCudfSerialization.java | 23 +- 
.../java/ai/rapids/cudf/MemoryCleaner.java | 41 +- .../ai/rapids/cudf/PackedColumnMetadata.java | 74 + .../java/ai/rapids/cudf/PinnedMemoryPool.java | 164 +- java/src/main/java/ai/rapids/cudf/Scalar.java | 21 +- java/src/main/java/ai/rapids/cudf/Schema.java | 13 +- java/src/main/java/ai/rapids/cudf/Table.java | 481 +++-- .../main/java/ai/rapids/cudf/TableDebug.java | 280 +++ .../main/java/ai/rapids/cudf/TableWriter.java | 76 +- .../ai/rapids/cudf/ast/UnaryOperator.java | 53 +- .../cudf/nvcomp/BatchedLZ4Compressor.java | 10 +- java/src/main/native/CMakeLists.txt | 6 +- java/src/main/native/include/jni_utils.hpp | 127 +- java/src/main/native/src/ChunkedPackJni.cpp | 75 + java/src/main/native/src/ColumnViewJni.cpp | 94 +- java/src/main/native/src/ColumnViewJni.cu | 8 +- java/src/main/native/src/ColumnViewJni.hpp | 4 +- .../main/native/src/CompiledExpression.cpp | 51 +- .../main/native/src/ContiguousTableJni.cpp | 30 +- java/src/main/native/src/CudfJni.cpp | 25 +- .../native/src/PackedColumnMetadataJni.cpp | 41 + java/src/main/native/src/RmmJni.cpp | 7 +- java/src/main/native/src/TableJni.cpp | 283 +-- java/src/main/native/src/cudf_jni_apis.hpp | 3 +- .../main/native/src/jni_writer_data_sink.hpp | 29 +- java/src/main/native/src/row_conversion.cu | 34 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 151 +- .../cudf/ColumnViewNonEmptyNullsTest.java | 6 +- .../test/java/ai/rapids/cudf/CuFileTest.java | 15 +- .../java/ai/rapids/cudf/CudaFatalTest.java | 6 +- .../test/java/ai/rapids/cudf/CudaTest.java | 6 +- .../java/ai/rapids/cudf/GatherMapTest.java | 8 +- .../java/ai/rapids/cudf/LargeTableTest.java | 66 + .../java/ai/rapids/cudf/MemoryBufferTest.java | 42 +- .../java/ai/rapids/cudf/ReductionTest.java | 11 + .../src/test/java/ai/rapids/cudf/RmmTest.java | 13 +- .../test/java/ai/rapids/cudf/TableTest.java | 291 ++- .../rapids/cudf/UnsafeMemoryAccessorTest.java | 4 +- .../cudf/ast/CompiledExpressionTest.java | 20 +- .../ai/rapids/cudf/nvcomp/NvcompTest.java | 8 +- pyproject.toml | 13 + python/cudf/CMakeLists.txt | 4 +- python/cudf/benchmarks/API/bench_dataframe.py | 34 + .../internal/bench_dataframe_internal.py | 6 +- .../cudf/cmake/Modules/ProtobufHelpers.cmake | 3 +- python/cudf/cmake/Modules/WheelHelpers.cmake | 4 +- python/cudf/cudf/__init__.py | 44 +- python/cudf/cudf/_fuzz_testing/utils.py | 12 +- python/cudf/cudf/_lib/CMakeLists.txt | 7 +- python/cudf/cudf/_lib/__init__.py | 1 + python/cudf/cudf/_lib/column.pxd | 6 +- python/cudf/cudf/_lib/column.pyi | 11 +- python/cudf/cudf/_lib/column.pyx | 180 +- python/cudf/cudf/_lib/concat.pyx | 7 +- python/cudf/cudf/_lib/copying.pyx | 37 +- python/cudf/cudf/_lib/cpp/CMakeLists.txt | 23 + python/cudf/cudf/_lib/cpp/column/column.pxd | 8 +- .../cudf/cudf/_lib/cpp/column/column_view.pxd | 3 +- python/cudf/cudf/_lib/cpp/copying.pxd | 6 +- python/cudf/cudf/_lib/cpp/copying.pyx | 0 python/cudf/cudf/_lib/cpp/expressions.pxd | 9 +- .../cudf/cudf/_lib/cpp/io/arrow_io_source.pxd | 15 + python/cudf/cudf/_lib/cpp/io/data_sink.pxd | 8 + python/cudf/cudf/_lib/cpp/io/datasource.pxd | 8 + python/cudf/cudf/_lib/cpp/io/orc.pxd | 16 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 24 +- python/cudf/cudf/_lib/cpp/io/types.pxd | 29 +- python/cudf/cudf/_lib/cpp/libcpp/memory.pxd | 12 + python/cudf/cudf/_lib/cpp/null_mask.pxd | 10 +- .../cudf/_lib/cpp/nvtext/generate_ngrams.pxd | 7 +- python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd | 16 + python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd | 8 +- .../cudf/_lib/cpp/nvtext/subword_tokenize.pxd | 8 +- python/cudf/cudf/_lib/cpp/sorting.pxd | 
29 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 16 +- python/cudf/cudf/_lib/cpp/table/table.pxd | 3 +- python/cudf/cudf/_lib/cpp/types.pxd | 68 +- python/cudf/cudf/_lib/cpp/types.pyx | 0 python/cudf/cudf/_lib/csv.pyx | 8 +- python/cudf/cudf/_lib/exception_handler.hpp | 80 - python/cudf/cudf/_lib/exception_handler.pxd | 66 +- python/cudf/cudf/_lib/expressions.pxd | 3 + python/cudf/cudf/_lib/expressions.pyx | 40 +- python/cudf/cudf/_lib/interop.pyx | 4 + python/cudf/cudf/_lib/io/datasource.pxd | 8 +- python/cudf/cudf/_lib/io/datasource.pyx | 7 +- python/cudf/cudf/_lib/io/utils.pxd | 10 +- python/cudf/cudf/_lib/io/utils.pyx | 6 +- python/cudf/cudf/_lib/join.pyx | 14 +- python/cudf/cudf/_lib/json.pyx | 15 +- python/cudf/cudf/_lib/null_mask.pyx | 13 +- python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 5 +- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 20 +- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 31 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 36 +- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 4 +- .../cudf/_lib/nvtext/subword_tokenize.pyx | 4 +- python/cudf/cudf/_lib/orc.pyx | 36 +- python/cudf/cudf/_lib/parquet.pyx | 46 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 21 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 18 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 16 + python/cudf/cudf/_lib/pylibcudf/column.pxd | 50 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 194 ++ python/cudf/cudf/_lib/pylibcudf/copying.pxd | 15 + python/cudf/cudf/_lib/pylibcudf/copying.pyx | 57 + .../cudf/_lib/pylibcudf/gpumemoryview.pxd | 9 + .../cudf/_lib/pylibcudf/gpumemoryview.pyx | 27 + python/cudf/cudf/_lib/pylibcudf/table.pxd | 18 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 62 + python/cudf/cudf/_lib/pylibcudf/types.pxd | 16 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 45 + python/cudf/cudf/_lib/pylibcudf/utils.pxd | 7 + python/cudf/cudf/_lib/pylibcudf/utils.pyx | 13 + python/cudf/cudf/_lib/scalar.pyx | 68 +- python/cudf/cudf/_lib/sort.pyx | 274 ++- python/cudf/cudf/_lib/stream_compaction.pyx | 39 +- python/cudf/cudf/_lib/strings/__init__.py | 4 +- python/cudf/cudf/_lib/strings/char_types.pyx | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- .../strings/convert/convert_fixed_point.pyx | 12 +- python/cudf/cudf/_lib/strings/translate.pyx | 4 +- python/cudf/cudf/_lib/strings_udf.pyx | 4 +- python/cudf/cudf/_lib/transform.pyx | 2 +- python/cudf/cudf/_lib/types.pxd | 3 +- python/cudf/cudf/_lib/types.pyx | 82 +- python/cudf/cudf/_lib/utils.pxd | 3 +- python/cudf/cudf/_lib/utils.pyx | 28 +- python/cudf/cudf/api/extensions/__init__.py | 5 +- python/cudf/cudf/api/types.py | 28 +- python/cudf/cudf/core/_base_index.py | 376 +++- python/cudf/cudf/core/_compat.py | 1 + .../cudf/cudf/core/_internals/expressions.py | 2 + python/cudf/cudf/core/_internals/timezones.py | 40 +- python/cudf/cudf/core/_internals/where.py | 5 +- python/cudf/cudf/core/algorithms.py | 14 +- python/cudf/cudf/core/buffer/__init__.py | 2 +- python/cudf/cudf/core/buffer/buffer.py | 56 +- python/cudf/cudf/core/buffer/cow_buffer.py | 170 -- .../core/buffer/exposure_tracked_buffer.py | 311 +++ python/cudf/cudf/core/buffer/spill_manager.py | 2 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 31 +- python/cudf/cudf/core/buffer/utils.py | 17 +- python/cudf/cudf/core/column/categorical.py | 90 +- python/cudf/cudf/core/column/column.py | 450 +++-- python/cudf/cudf/core/column/datetime.py | 102 +- python/cudf/cudf/core/column/decimal.py | 68 +- python/cudf/cudf/core/column/interval.py | 3 +- python/cudf/cudf/core/column/numerical.py | 92 +- 
.../cudf/cudf/core/column/numerical_base.py | 10 + python/cudf/cudf/core/column/string.py | 181 +- python/cudf/cudf/core/column/struct.py | 8 +- python/cudf/cudf/core/column/timedelta.py | 39 +- python/cudf/cudf/core/column_accessor.py | 45 +- python/cudf/cudf/core/copy_types.py | 171 ++ python/cudf/cudf/core/dataframe.py | 884 ++++++--- python/cudf/cudf/core/dtypes.py | 72 +- python/cudf/cudf/core/frame.py | 285 +-- python/cudf/cudf/core/groupby/groupby.py | 196 +- python/cudf/cudf/core/index.py | 635 +++--- python/cudf/cudf/core/indexed_frame.py | 485 ++++- python/cudf/cudf/core/indexing_utils.py | 243 +++ python/cudf/cudf/core/join/_join_helpers.py | 36 +- python/cudf/cudf/core/join/join.py | 27 +- python/cudf/cudf/core/missing.py | 6 +- python/cudf/cudf/core/multiindex.py | 121 +- python/cudf/cudf/core/resample.py | 61 +- python/cudf/cudf/core/reshape.py | 29 +- python/cudf/cudf/core/scalar.py | 27 +- python/cudf/cudf/core/series.py | 188 +- python/cudf/cudf/core/single_column_frame.py | 43 +- python/cudf/cudf/core/subword_tokenizer.py | 3 +- python/cudf/cudf/core/tools/datetimes.py | 46 +- python/cudf/cudf/core/udf/groupby_lowering.py | 61 +- python/cudf/cudf/core/udf/groupby_typing.py | 105 +- python/cudf/cudf/core/udf/groupby_utils.py | 54 +- python/cudf/cudf/core/udf/masked_lowering.py | 1 + python/cudf/cudf/core/udf/masked_typing.py | 29 + python/cudf/cudf/core/udf/utils.py | 152 +- python/cudf/cudf/errors.py | 6 +- python/cudf/cudf/io/csv.py | 7 +- python/cudf/cudf/io/json.py | 7 +- python/cudf/cudf/io/orc.py | 4 +- python/cudf/cudf/io/parquet.py | 148 +- python/cudf/cudf/options.py | 49 + python/cudf/cudf/testing/testing.py | 4 +- .../orc/TestOrcFile.Spark.EmptyDecompData.orc | Bin 0 -> 373 bytes ...tOrcFile.Spark.NestedNotNullableStruct.orc | Bin 0 -> 310 bytes .../tests/data/parquet/delta_encoding.parquet | Bin 577 -> 577 bytes .../data/parquet/fixed_len_byte_array.parquet | Bin 0 -> 259 bytes .../data/parquet/rle_boolean_encoding.parquet | Bin 0 -> 192 bytes .../cudf/tests/dataframe/test_conversion.py | 37 + .../tests/indexes/datetime/test_indexing.py | 19 + .../indexes/datetime/test_time_specific.py | 16 + .../cudf/cudf/tests/indexes/test_interval.py | 309 +++ .../cudf/cudf/tests/input_output/test_text.py | 165 +- .../cudf/cudf/tests/series/test_conversion.py | 33 + .../cudf/tests/series/test_datetimelike.py | 67 + python/cudf/cudf/tests/test_api_types.py | 24 +- python/cudf/cudf/tests/test_array_function.py | 24 +- python/cudf/cudf/tests/test_binops.py | 102 +- python/cudf/cudf/tests/test_categorical.py | 10 + python/cudf/cudf/tests/test_column.py | 4 +- python/cudf/cudf/tests/test_copying.py | 582 +++--- python/cudf/cudf/tests/test_csv.py | 18 +- .../cudf/cudf/tests/test_custom_accessor.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 266 ++- python/cudf/cudf/tests/test_dataframe_copy.py | 42 +- python/cudf/cudf/tests/test_datetime.py | 74 + python/cudf/cudf/tests/test_decimal.py | 2 +- .../cudf/tests/test_extension_compilation.py | 15 +- python/cudf/cudf/tests/test_factorize.py | 37 +- python/cudf/cudf/tests/test_feather.py | 12 +- python/cudf/cudf/tests/test_groupby.py | 325 +++- python/cudf/cudf/tests/test_index.py | 412 ++-- python/cudf/cudf/tests/test_indexing.py | 440 ++++- python/cudf/cudf/tests/test_interval.py | 52 +- python/cudf/cudf/tests/test_joining.py | 48 +- python/cudf/cudf/tests/test_json.py | 77 +- python/cudf/cudf/tests/test_list.py | 7 +- python/cudf/cudf/tests/test_monotonic.py | 13 +- python/cudf/cudf/tests/test_multiindex.py | 86 +- 
python/cudf/cudf/tests/test_numba_import.py | 46 + python/cudf/cudf/tests/test_numerical.py | 6 +- python/cudf/cudf/tests/test_onehot.py | 100 +- python/cudf/cudf/tests/test_options.py | 114 +- python/cudf/cudf/tests/test_orc.py | 28 +- python/cudf/cudf/tests/test_parquet.py | 194 +- python/cudf/cudf/tests/test_quantiles.py | 15 + python/cudf/cudf/tests/test_rank.py | 12 +- python/cudf/cudf/tests/test_reductions.py | 69 +- python/cudf/cudf/tests/test_replace.py | 13 +- python/cudf/cudf/tests/test_repr.py | 119 +- python/cudf/cudf/tests/test_reshape.py | 112 ++ python/cudf/cudf/tests/test_scalar.py | 28 +- python/cudf/cudf/tests/test_search.py | 14 +- python/cudf/cudf/tests/test_serialize.py | 11 + python/cudf/cudf/tests/test_series.py | 84 +- python/cudf/cudf/tests/test_setitem.py | 97 +- python/cudf/cudf/tests/test_sparse_df.py | 4 +- python/cudf/cudf/tests/test_string.py | 8 +- python/cudf/cudf/tests/test_string_udfs.py | 8 +- python/cudf/cudf/tests/test_struct.py | 19 +- .../cudf/cudf/tests/test_subword_tokenizer.py | 238 --- python/cudf/cudf/tests/test_timedelta.py | 42 + python/cudf/cudf/tests/test_udf_masked_ops.py | 36 + python/cudf/cudf/tests/test_unaops.py | 11 +- .../cudf/tests/text/test_subword_tokenizer.py | 239 ++- .../test_text_methods.py} | 303 +-- python/cudf/cudf/utils/_numba.py | 191 ++ python/cudf/cudf/utils/_ptxcompiler.py | 107 + python/cudf/cudf/utils/applyutils.py | 12 +- python/cudf/cudf/utils/cudautils.py | 144 +- python/cudf/cudf/utils/dtypes.py | 22 +- python/cudf/cudf/utils/ioutils.py | 19 +- python/cudf/cudf/utils/queryutils.py | 9 +- python/cudf/cudf/utils/utils.py | 187 +- python/cudf/pyproject.toml | 28 +- python/cudf/udf_cpp/CMakeLists.txt | 2 +- python/cudf/udf_cpp/shim.cu | 135 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 32 +- python/cudf_kafka/pyproject.toml | 10 +- python/custreamz/custreamz/kafka.py | 21 +- .../custreamz/tests/test_dataframes.py | 4 +- python/custreamz/pyproject.toml | 6 +- python/dask_cudf/dask_cudf/__init__.py | 2 +- python/dask_cudf/dask_cudf/backends.py | 36 + python/dask_cudf/dask_cudf/io/csv.py | 23 +- python/dask_cudf/dask_cudf/io/parquet.py | 105 +- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 83 + .../dask_cudf/io/tests/test_parquet.py | 70 +- .../dask_cudf/tests/test_accessor.py | 34 +- .../dask_cudf/tests/test_dispatch.py | 65 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 30 +- python/dask_cudf/pyproject.toml | 14 +- 1080 files changed, 46293 insertions(+), 20963 deletions(-) delete mode 100644 .flake8 create mode 100644 .github/copy-pr-bot.yaml create mode 100755 ci/build_wheel.sh create mode 100755 ci/build_wheel_cudf.sh create mode 100755 ci/build_wheel_dask_cudf.sh delete mode 100755 ci/docs/build.sh delete mode 100755 ci/release/apply_wheel_modifications.sh create mode 100755 ci/test_wheel_cudf.sh create mode 100755 ci/test_wheel_dask_cudf.sh create mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml delete mode 100644 conda/recipes/libcudf/nvcomp.txt delete mode 100644 conda/recipes/libcudf/post-link.sh create mode 100644 cpp/benchmarks/fixture/nvbench_fixture.hpp delete mode 100644 cpp/benchmarks/fixture/rmm_pool_raii.hpp create mode 100644 cpp/benchmarks/stream_compaction/stable_distinct.cpp create mode 100644 cpp/benchmarks/string/char_types.cpp create mode 100644 cpp/benchmarks/string/count.cpp create mode 100644 cpp/benchmarks/string/gather.cpp create mode 100644 cpp/benchmarks/string/join_strings.cpp create mode 100644 
cpp/benchmarks/string/split_re.cpp create mode 100644 cpp/benchmarks/text/edit_distance.cpp create mode 100644 cpp/benchmarks/text/hash_ngrams.cpp create mode 100644 cpp/benchmarks/text/jaccard.cpp delete mode 100644 cpp/benchmarks/text/normalize_spaces.cpp create mode 100644 cpp/cmake/thirdparty/get_libcudacxx.cmake create mode 100644 cpp/include/cudf/ast/detail/expression_transformer.hpp rename cpp/include/cudf/detail/{concatenate.cuh => concatenate_masks.hpp} (76%) delete mode 100644 cpp/include/cudf/detail/utilities/hash_functions.cuh create mode 100644 cpp/include/cudf/detail/utilities/stacktrace.hpp create mode 100644 cpp/include/cudf/hashing/detail/default_hash.cuh create mode 100644 cpp/include/cudf/hashing/detail/hash_functions.cuh rename cpp/include/cudf/{ => hashing}/detail/hashing.hpp (62%) create mode 100644 cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh create mode 100644 cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh create mode 100644 cpp/include/cudf/io/arrow_io_source.hpp create mode 100644 cpp/include/cudf/io/parquet_metadata.hpp create mode 100644 cpp/include/cudf/reduction/detail/reduction.hpp create mode 100644 cpp/include/nvtext/detail/generate_ngrams.hpp create mode 100644 cpp/include/nvtext/jaccard.hpp create mode 100644 cpp/src/hash/murmurhash3_x64_128.cu rename cpp/src/hash/{murmur_hash.cu => murmurhash3_x86_32.cu} (64%) rename cpp/src/hash/{spark_murmur_hash.cu => spark_murmurhash3_x86_32.cu} (81%) create mode 100644 cpp/src/hash/xxhash_64.cu create mode 100644 cpp/src/io/comp/statistics.cu rename cpp/src/io/json/{experimental => }/byte_range_info.cu (89%) rename cpp/src/io/json/{ => legacy}/json_gpu.cu (95%) rename cpp/src/io/json/{ => legacy}/json_gpu.hpp (95%) create mode 100644 cpp/src/io/json/legacy/read_json.hpp rename cpp/src/io/json/{ => legacy}/reader_impl.cu (94%) rename cpp/src/io/json/{experimental/read_json.cpp => read_json.cu} (80%) rename cpp/src/io/json/{experimental => }/read_json.hpp (91%) create mode 100644 cpp/src/io/parquet/decode_preprocess.cu create mode 100644 cpp/src/io/parquet/delta_binary.cuh create mode 100644 cpp/src/io/parquet/page_decode.cuh create mode 100644 cpp/src/io/parquet/page_delta_decode.cu create mode 100644 cpp/src/io/parquet/page_string_decode.cu create mode 100644 cpp/src/io/parquet/page_string_utils.cuh create mode 100644 cpp/src/io/parquet/predicate_pushdown.cpp create mode 100644 cpp/src/io/parquet/rle_stream.cuh create mode 100644 cpp/src/io/utilities/arrow_io_source.cpp create mode 100644 cpp/src/reductions/nested_type_minmax_util.cuh delete mode 100644 cpp/src/reductions/struct_minmax_util.cuh create mode 100644 cpp/src/rolling/detail/optimized_unbounded_window.cpp create mode 100644 cpp/src/rolling/detail/optimized_unbounded_window.hpp create mode 100644 cpp/src/rolling/detail/range_comparator_utils.cuh create mode 100644 cpp/src/text/jaccard.cu create mode 100644 cpp/src/utilities/stacktrace.cpp delete mode 100644 cpp/tests/hashing/hash_test.cpp create mode 100644 cpp/tests/hashing/md5_test.cpp create mode 100644 cpp/tests/hashing/murmurhash3_x64_128_test.cpp create mode 100644 cpp/tests/hashing/murmurhash3_x86_32_test.cpp create mode 100644 cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp create mode 100644 cpp/tests/hashing/xxhash_64_test.cpp create mode 100644 cpp/tests/rolling/range_comparator_test.cu create mode 100644 cpp/tests/stream_compaction/stable_distinct_tests.cpp create mode 100644 cpp/tests/streams/concatenate_test.cpp create mode 100644 cpp/tests/streams/copying_test.cpp 
create mode 100644 cpp/tests/streams/filling_test.cpp create mode 100644 cpp/tests/streams/groupby_test.cpp create mode 100644 cpp/tests/streams/hash_test.cpp create mode 100644 cpp/tests/streams/replace_test.cpp create mode 100644 cpp/tests/streams/search_test.cpp create mode 100644 cpp/tests/table/row_operator_tests_utilities.cu create mode 100644 cpp/tests/table/row_operator_tests_utilities.hpp create mode 100644 cpp/tests/text/jaccard_tests.cpp rename docs/cudf/source/_templates/autosummary/{class_without_autosummary.rst => class.rst} (50%) delete mode 100644 docs/cudf/source/_templates/autosummary/class_with_autosummary.rst create mode 100644 docs/cudf/source/api_docs/extension_dtypes.rst create mode 100644 docs/cudf/source/developer_guide/pylibcudf.md delete mode 100644 docs/cudf/source/user_guide/cudf.CategoricalDtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal128Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal32Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.Decimal64Dtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.ListDtype.rst delete mode 100644 docs/cudf/source/user_guide/cudf.StructDtype.rst create mode 100644 java/src/main/java/ai/rapids/cudf/ChunkedPack.java create mode 100755 java/src/main/java/ai/rapids/cudf/CudfColumnSizeOverflowException.java create mode 100644 java/src/main/java/ai/rapids/cudf/DefaultHostMemoryAllocator.java create mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryAllocator.java create mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java create mode 100644 java/src/main/java/ai/rapids/cudf/PackedColumnMetadata.java create mode 100644 java/src/main/java/ai/rapids/cudf/TableDebug.java create mode 100644 java/src/main/native/src/ChunkedPackJni.cpp create mode 100644 java/src/main/native/src/PackedColumnMetadataJni.cpp create mode 100644 java/src/test/java/ai/rapids/cudf/LargeTableTest.java create mode 100644 python/cudf/cudf/_lib/cpp/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/cpp/copying.pyx create mode 100644 python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/data_sink.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/datasource.pxd create mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd create mode 100644 python/cudf/cudf/_lib/cpp/types.pyx delete mode 100644 python/cudf/cudf/_lib/exception_handler.hpp create mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/column.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/column.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/copying.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/copying.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/table.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/table.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/types.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/utils.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/utils.pyx delete mode 100644 
python/cudf/cudf/core/buffer/cow_buffer.py
 create mode 100644 python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
 create mode 100644 python/cudf/cudf/core/copy_types.py
 create mode 100644 python/cudf/cudf/core/indexing_utils.py
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.NestedNotNullableStruct.orc
 create mode 100644 python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet
 create mode 100644 python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet
 create mode 100644 python/cudf/cudf/tests/indexes/datetime/test_indexing.py
 create mode 100644 python/cudf/cudf/tests/test_numba_import.py
 delete mode 100644 python/cudf/cudf/tests/test_subword_tokenizer.py
 rename python/cudf/cudf/tests/{test_text.py => text/test_text_methods.py} (81%)
 create mode 100644 python/cudf/cudf/utils/_numba.py
 create mode 100644 python/cudf/cudf/utils/_ptxcompiler.py

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index e80e3afc443..00000000000
--- a/.flake8
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2017-2023, NVIDIA CORPORATION.
-
-[flake8]
-filename = *.py, *.pyx, *.pxd, *.pxi
-exclude = __init__.py, *.egg, build, docs, .git
-force-check = True
-ignore =
-    # line break before binary operator
-    W503,
-    # whitespace before :
-    E203
-per-file-ignores =
-    # Rules ignored only in Cython:
-    # E211: whitespace before '(' (used in multi-line imports)
-    # E225: Missing whitespace around operators (breaks cython casting syntax like )
-    # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
-    # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
-    # E275: Missing whitespace after keyword (Doesn't work with Cython except?)
-    # E402: invalid syntax (works for Python, not Cython)
-    # E999: invalid syntax (works for Python, not Cython)
-    # W504: line break after binary operator (breaks lines that end with a pointer)
-    *.pyx: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxd: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxi: E211, E225, E226, E227, E275, E402, E999, W504
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 00000000000..895ba83ee54
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
index 2d1444c595d..9a0b4155035 100644
--- a/.github/ops-bot.yaml
+++ b/.github/ops-bot.yaml
@@ -5,5 +5,4 @@ auto_merger: true
 branch_checker: true
 label_checker: true
 release_drafter: true
-copy_prs: true
 recently_updated: true
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 2d592d3f247..91ec0904103 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -54,31 +54,32 @@ jobs:
       sha: ${{ inputs.sha }}
       skip_upload_pkgs: libcudf-example
   docs-build:
-    if: github.ref_type == 'branch' && github.event_name == 'push'
+    if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10
     with:
-      build_type: branch
-      node_type: "gpu-v100-latest-1"
       arch: "amd64"
+      branch: ${{ inputs.branch }}
+      build_type: ${{ inputs.build_type || 'branch' }}
       container_image: "rapidsai/ci:latest"
+      date: ${{ inputs.date }}
+      node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
+      sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
     with:
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
      sha: ${{ inputs.sha }}
      date: ${{ inputs.date }}
-      package-name: cudf
-      package-dir: python/cudf
-      skbuild-configure-options: "-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF"
+      script: ci/build_wheel_cudf.sh
  wheel-publish-cudf:
    needs: wheel-build-cudf
    secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
    with:
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
@@ -88,18 +89,18 @@ jobs:
  wheel-build-dask-cudf:
    needs: wheel-publish-cudf
    secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10
    with:
+      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
      sha: ${{ inputs.sha }}
      date: ${{ inputs.date }}
-      package-name: dask_cudf
-      package-dir: python/dask_cudf
+      script: ci/build_wheel_dask_cudf.sh
  wheel-publish-dask-cudf:
    needs: wheel-build-dask-cudf
    secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-publish.yml@cuda-120-pip
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10
    with:
      build_type: ${{ inputs.build_type || 'branch' }}
      branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 040fac16b8d..b47a40b13d2 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -26,34 +26,34 @@ jobs:
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: pull-request
       test_script: "ci/test_python_cudf.sh"
@@ -61,14 +61,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10
     with:
       build_type: pull-request
       test_script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses:
rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,37 +98,30 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: build_type: pull-request - package-name: cudf - package-dir: python/cudf - skbuild-configure-options: "-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" + script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: pull-request - package-name: cudf - test-unittest: "python -m pytest -n 8 ./python/cudf/cudf/tests" - test-smoketest: "python ./ci/wheel_smoke_test_cudf.py" + script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-build.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request - package-name: dask_cudf - package-dir: python/dask_cudf - before-wheel: "RAPIDS_PY_WHEEL_NAME=cudf_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-cudf && python -m pip install --no-deps ./local-cudf/cudf*.whl" + script: "ci/build_wheel_dask_cudf.sh" wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request - package-name: dask_cudf - # Install the cudf we just built, and also test against latest dask/distributed/dask-cuda. 
- test-before: "RAPIDS_PY_WHEEL_NAME=cudf_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && python -m pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests" + script: ci/test_wheel_dask_cudf.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a6fbc522845..6bd2787d6dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,23 +79,20 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - package-name: cudf - test-unittest: "python -m pytest -n 8 ./python/cudf/cudf/tests" + script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure-test.yml@cuda-120-pip + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 with: + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly 
branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - package-name: dask_cudf - # Test against latest dask/distributed/dask-cuda. - test-before: "python -m pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06" - test-unittest: "python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests" + script: ci/test_wheel_dask_cudf.sh diff --git a/.gitignore b/.gitignore index fb5c301fe3f..a9bf0854d65 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ DartConfiguration.tcl *.spec .nfs* .clangd +compile_commands.json ## Python build directories & artifacts dask-worker-space/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ac54113278..b5165cf026f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,17 +30,8 @@ repos: files: python/.* # Explicitly specify the pyproject.toml at the repo root, not per-project. args: ["--config", "pyproject.toml"] - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: ["--config=.flake8"] - files: python/.*$ - types: [file] - types_or: [python, cython] - additional_dependencies: ["flake8-force"] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.1.10 + rev: v0.15.0 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy @@ -85,7 +76,7 @@ repos: (?x)^( ^cpp/include/cudf_test/cxxopts.hpp| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| - ^python/cudf/cudf/tests/test_text.py + ^python/cudf/cudf/tests/text/test_text_methods.py ) - repo: local hooks: @@ -165,6 +156,12 @@ repos: hooks: - id: rapids-dependency-file-generator args: ["--clean"] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.278 + hooks: + - id: ruff + files: python/.*$ + default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c28dba6462..76abf241d96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,431 @@ +# cuDF 23.08.00 (9 Aug 2023) + +## 🚨 Breaking Changes + +- Enforce deprecations and add clarifications around existing deprecations ([#13710](https://github.com/rapidsai/cudf/pull/13710)) [@galipremsagar](https://github.com/galipremsagar) +- Separate MurmurHash32 from hash_functions.cuh ([#13681](https://github.com/rapidsai/cudf/pull/13681)) [@davidwendt](https://github.com/davidwendt) +- Avoid storing metadata in pointers in ORC and Parquet writers ([#13648](https://github.com/rapidsai/cudf/pull/13648)) [@vuule](https://github.com/vuule) +- Expose streams in all public copying APIs ([#13629](https://github.com/rapidsai/cudf/pull/13629)) [@vyasr](https://github.com/vyasr) +- Remove deprecated cudf::strings::slice_strings (by delimiter) functions ([#13628](https://github.com/rapidsai/cudf/pull/13628)) [@davidwendt](https://github.com/davidwendt) +- Remove deprecated cudf.set_allocator. 
([#13591](https://github.com/rapidsai/cudf/pull/13591)) [@bdice](https://github.com/bdice) +- Change build.sh to use pip install instead of setup.py ([#13507](https://github.com/rapidsai/cudf/pull/13507)) [@vyasr](https://github.com/vyasr) +- Remove unused max_rows_tensor parameter from subword tokenizer ([#13463](https://github.com/rapidsai/cudf/pull/13463)) [@davidwendt](https://github.com/davidwendt) +- Fix decimal scale reductions in `_get_decimal_type` ([#13224](https://github.com/rapidsai/cudf/pull/13224)) [@charlesbluca](https://github.com/charlesbluca) + +## 🐛 Bug Fixes + +- Add CUDA version to cudf_kafka and libcudf-example build strings. ([#13769](https://github.com/rapidsai/cudf/pull/13769)) [@bdice](https://github.com/bdice) +- Fix typo in wheels-test.yaml. ([#13763](https://github.com/rapidsai/cudf/pull/13763)) [@bdice](https://github.com/bdice) +- Don't test strings shorter than the requested ngram size ([#13758](https://github.com/rapidsai/cudf/pull/13758)) [@vyasr](https://github.com/vyasr) +- Add CUDA version to custreamz build string. ([#13754](https://github.com/rapidsai/cudf/pull/13754)) [@bdice](https://github.com/bdice) +- Fix writing of ORC files with empty child string columns ([#13745](https://github.com/rapidsai/cudf/pull/13745)) [@vuule](https://github.com/vuule) +- Remove the erroneous "empty level" short-circuit from ORC reader ([#13722](https://github.com/rapidsai/cudf/pull/13722)) [@vuule](https://github.com/vuule) +- Fix character counting when writing sliced tables into ORC ([#13721](https://github.com/rapidsai/cudf/pull/13721)) [@vuule](https://github.com/vuule) +- Parquet uses row group row count if missing from header ([#13712](https://github.com/rapidsai/cudf/pull/13712)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix reading of RLE encoded boolean data from parquet files with V2 page headers ([#13707](https://github.com/rapidsai/cudf/pull/13707)) [@etseidl](https://github.com/etseidl) +- Fix a corner case of list lexicographic comparator ([#13701](https://github.com/rapidsai/cudf/pull/13701)) [@ttnghia](https://github.com/ttnghia) +- Fix combined filtering and column projection in `dask_cudf.read_parquet` ([#13697](https://github.com/rapidsai/cudf/pull/13697)) [@rjzamora](https://github.com/rjzamora) +- Revert fetch-rapids changes ([#13696](https://github.com/rapidsai/cudf/pull/13696)) [@vyasr](https://github.com/vyasr) +- Data generator - include offsets in the size estimate of list elements ([#13688](https://github.com/rapidsai/cudf/pull/13688)) [@vuule](https://github.com/vuule) +- Add `cuda-nvcc-impl` to `cudf` for `numba` CUDA 12 ([#13673](https://github.com/rapidsai/cudf/pull/13673)) [@jakirkham](https://github.com/jakirkham) +- Fix combined filtering and column projection in `read_parquet` ([#13666](https://github.com/rapidsai/cudf/pull/13666)) [@rjzamora](https://github.com/rjzamora) +- Use `thrust::identity` as hash functions for byte pair encoding ([#13665](https://github.com/rapidsai/cudf/pull/13665)) [@PointKernel](https://github.com/PointKernel) +- Fix loc-getitem ordering when index contains duplicate labels ([#13659](https://github.com/rapidsai/cudf/pull/13659)) [@wence-](https://github.com/wence-) +- [REVIEW] Introduce parity with pandas for `MultiIndex.loc` ordering & fix a bug in `Groupby` with `as_index` ([#13657](https://github.com/rapidsai/cudf/pull/13657)) [@galipremsagar](https://github.com/galipremsagar) +- Fix memcheck error found in nvtext tokenize functions ([#13649](https://github.com/rapidsai/cudf/pull/13649))
[@davidwendt](https://github.com/davidwendt) +- Fix `has_nonempty_nulls` ignoring column offset ([#13647](https://github.com/rapidsai/cudf/pull/13647)) [@ttnghia](https://github.com/ttnghia) +- [Java] Avoid double-free corruption in case of an Exception while creating a ColumnView ([#13645](https://github.com/rapidsai/cudf/pull/13645)) [@razajafri](https://github.com/razajafri) +- Fix memcheck error in ORC reader call to cudf::io::copy_uncompressed_kernel ([#13643](https://github.com/rapidsai/cudf/pull/13643)) [@davidwendt](https://github.com/davidwendt) +- Fix CUDA 12 conda environment to remove cubinlinker and ptxcompiler. ([#13636](https://github.com/rapidsai/cudf/pull/13636)) [@bdice](https://github.com/bdice) +- Fix inf/NaN comparisons for FLOAT orderby in window functions ([#13635](https://github.com/rapidsai/cudf/pull/13635)) [@mythrocks](https://github.com/mythrocks) +- Refactor `Index` search to simplify code and increase correctness ([#13625](https://github.com/rapidsai/cudf/pull/13625)) [@wence-](https://github.com/wence-) +- Fix compile warning for unused variable in split_re.cu ([#13621](https://github.com/rapidsai/cudf/pull/13621)) [@davidwendt](https://github.com/davidwendt) +- Fix tz_localize for dask_cudf Series ([#13610](https://github.com/rapidsai/cudf/pull/13610)) [@shwina](https://github.com/shwina) +- Fix issue with no decompressed data in ORC reader ([#13609](https://github.com/rapidsai/cudf/pull/13609)) [@vuule](https://github.com/vuule) +- Fix floating point window range extents. ([#13606](https://github.com/rapidsai/cudf/pull/13606)) [@mythrocks](https://github.com/mythrocks) +- Fix `localize(None)` for timezone-naive columns ([#13603](https://github.com/rapidsai/cudf/pull/13603)) [@shwina](https://github.com/shwina) +- Fixed a memory leak caused by Exception thrown while constructing a ColumnView ([#13597](https://github.com/rapidsai/cudf/pull/13597)) [@razajafri](https://github.com/razajafri) +- Handle nullptr return value from bitmask_or in distinct_count ([#13590](https://github.com/rapidsai/cudf/pull/13590)) [@wence-](https://github.com/wence-) +- Bring parity with pandas in Index.join ([#13589](https://github.com/rapidsai/cudf/pull/13589)) [@galipremsagar](https://github.com/galipremsagar) +- Fix cudf.melt when there are more than 255 columns ([#13588](https://github.com/rapidsai/cudf/pull/13588)) [@hcho3](https://github.com/hcho3) +- Fix memory issues in cuIO due to removal of memory padding ([#13586](https://github.com/rapidsai/cudf/pull/13586)) [@ttnghia](https://github.com/ttnghia) +- Fix Parquet multi-file reading ([#13584](https://github.com/rapidsai/cudf/pull/13584)) [@etseidl](https://github.com/etseidl) +- Fix memcheck error found in LISTS_TEST ([#13579](https://github.com/rapidsai/cudf/pull/13579)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck error found in STRINGS_TEST ([#13578](https://github.com/rapidsai/cudf/pull/13578)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck error found in INTEROP_TEST ([#13577](https://github.com/rapidsai/cudf/pull/13577)) [@davidwendt](https://github.com/davidwendt) +- Fix memcheck errors found in REDUCTION_TEST ([#13574](https://github.com/rapidsai/cudf/pull/13574)) [@davidwendt](https://github.com/davidwendt) +- Preemptive fix for hive-partitioning change in dask ([#13564](https://github.com/rapidsai/cudf/pull/13564)) [@rjzamora](https://github.com/rjzamora) +- Fix an issue with `dask_cudf.read_csv` when lines are needed to be skipped 
([#13555](https://github.com/rapidsai/cudf/pull/13555)) [@galipremsagar](https://github.com/galipremsagar) +- Fix out-of-bounds memory write in cudf::dictionary::detail::concatenate ([#13554](https://github.com/rapidsai/cudf/pull/13554)) [@davidwendt](https://github.com/davidwendt) +- Fix the null mask size in json reader ([#13537](https://github.com/rapidsai/cudf/pull/13537)) [@karthikeyann](https://github.com/karthikeyann) +- Fix cudf::strings::strip for all-empty input column ([#13533](https://github.com/rapidsai/cudf/pull/13533)) [@davidwendt](https://github.com/davidwendt) +- Make sure to build without isolation or installing dependencies ([#13524](https://github.com/rapidsai/cudf/pull/13524)) [@vyasr](https://github.com/vyasr) +- Remove preload lib from CMake for now ([#13519](https://github.com/rapidsai/cudf/pull/13519)) [@vyasr](https://github.com/vyasr) +- Fix missing separator after null values in JSON writer ([#13503](https://github.com/rapidsai/cudf/pull/13503)) [@karthikeyann](https://github.com/karthikeyann) +- Ensure `single_lane_block_sum_reduce` is safe to call in a loop ([#13488](https://github.com/rapidsai/cudf/pull/13488)) [@wence-](https://github.com/wence-) +- Update all versions in pyproject.toml files. ([#13486](https://github.com/rapidsai/cudf/pull/13486)) [@bdice](https://github.com/bdice) +- Remove applying nvbench that doesn't exist in 23.08 ([#13484](https://github.com/rapidsai/cudf/pull/13484)) [@robertmaynard](https://github.com/robertmaynard) +- Fix chunked Parquet reader benchmark ([#13482](https://github.com/rapidsai/cudf/pull/13482)) [@vuule](https://github.com/vuule) +- Update JNI JSON reader column compatibility for Spark ([#13477](https://github.com/rapidsai/cudf/pull/13477)) [@revans2](https://github.com/revans2) +- Fix unsanitized output of scan with strings ([#13455](https://github.com/rapidsai/cudf/pull/13455)) [@davidwendt](https://github.com/davidwendt) +- Reject functions without bytecode from `_can_be_jitted` in GroupBy Apply ([#13429](https://github.com/rapidsai/cudf/pull/13429)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix decimal scale reductions in `_get_decimal_type` ([#13224](https://github.com/rapidsai/cudf/pull/13224)) [@charlesbluca](https://github.com/charlesbluca) + +## 📖 Documentation + +- Fix doxygen groups for io data sources and sinks ([#13718](https://github.com/rapidsai/cudf/pull/13718)) [@davidwendt](https://github.com/davidwendt) +- Add pandas compatibility note to DataFrame.query docstring ([#13693](https://github.com/rapidsai/cudf/pull/13693)) [@beckernick](https://github.com/beckernick) +- Add pylibcudf to developer guide ([#13639](https://github.com/rapidsai/cudf/pull/13639)) [@vyasr](https://github.com/vyasr) +- Fix repeated words in doxygen text ([#13598](https://github.com/rapidsai/cudf/pull/13598)) [@karthikeyann](https://github.com/karthikeyann) +- Update docs for top-level API.
([#13592](https://github.com/rapidsai/cudf/pull/13592)) [@bdice](https://github.com/bdice) +- Fix the doxygen text for cudf::concatenate and other places ([#13561](https://github.com/rapidsai/cudf/pull/13561)) [@davidwendt](https://github.com/davidwendt) +- Document stream validation approach used in testing ([#13556](https://github.com/rapidsai/cudf/pull/13556)) [@vyasr](https://github.com/vyasr) +- Cleanup doc repetitions in libcudf ([#13470](https://github.com/rapidsai/cudf/pull/13470)) [@karthikeyann](https://github.com/karthikeyann) + +## 🚀 New Features + +- Support `min` and `max` aggregations for list type in groupby and reduction ([#13676](https://github.com/rapidsai/cudf/pull/13676)) [@ttnghia](https://github.com/ttnghia) +- Add nvtext::jaccard_index API for strings columns ([#13669](https://github.com/rapidsai/cudf/pull/13669)) [@davidwendt](https://github.com/davidwendt) +- Add read_parquet_metadata libcudf API ([#13663](https://github.com/rapidsai/cudf/pull/13663)) [@karthikeyann](https://github.com/karthikeyann) +- Expose streams in all public copying APIs ([#13629](https://github.com/rapidsai/cudf/pull/13629)) [@vyasr](https://github.com/vyasr) +- Add XXHash_64 hash function to cudf ([#13612](https://github.com/rapidsai/cudf/pull/13612)) [@davidwendt](https://github.com/davidwendt) +- Java support: Floating point order-by columns for RANGE window functions ([#13595](https://github.com/rapidsai/cudf/pull/13595)) [@mythrocks](https://github.com/mythrocks) +- Use `cuco::static_map` to build string dictionaries in ORC writer ([#13580](https://github.com/rapidsai/cudf/pull/13580)) [@vuule](https://github.com/vuule) +- Add pylibcudf subpackage with gather implementation ([#13562](https://github.com/rapidsai/cudf/pull/13562)) [@vyasr](https://github.com/vyasr) +- Add JNI for `lists::concatenate_list_elements` ([#13547](https://github.com/rapidsai/cudf/pull/13547)) [@ttnghia](https://github.com/ttnghia) +- Enable nested types for `lists::concatenate_list_elements` ([#13545](https://github.com/rapidsai/cudf/pull/13545)) [@ttnghia](https://github.com/ttnghia) +- Add unicode encoding for string columns in JSON writer ([#13539](https://github.com/rapidsai/cudf/pull/13539)) [@karthikeyann](https://github.com/karthikeyann) +- Remove numba kernels from `find_index_of_val` ([#13517](https://github.com/rapidsai/cudf/pull/13517)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Floating point order-by columns for RANGE window functions ([#13512](https://github.com/rapidsai/cudf/pull/13512)) [@mythrocks](https://github.com/mythrocks) +- Parse column chunk metadata statistics in parquet reader ([#13472](https://github.com/rapidsai/cudf/pull/13472)) [@karthikeyann](https://github.com/karthikeyann) +- Add `abs` function to apply ([#13408](https://github.com/rapidsai/cudf/pull/13408)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- [FEA] AST filtering in parquet reader ([#13348](https://github.com/rapidsai/cudf/pull/13348)) [@karthikeyann](https://github.com/karthikeyann) +- [FEA] Adds option to recover from invalid JSON lines in JSON tokenizer ([#13344](https://github.com/rapidsai/cudf/pull/13344)) [@elstehle](https://github.com/elstehle) +- Ensure cccl packages don't clash with upstream version ([#13235](https://github.com/rapidsai/cudf/pull/13235)) [@robertmaynard](https://github.com/robertmaynard) +- Update `struct_minmax_util` to experimental row comparator ([#13069](https://github.com/rapidsai/cudf/pull/13069)) [@divyegala](https://github.com/divyegala) +- Add
stream parameter to hashing APIs ([#12090](https://github.com/rapidsai/cudf/pull/12090)) [@vyasr](https://github.com/vyasr) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.08` release ([#13802](https://github.com/rapidsai/cudf/pull/13802)) [@galipremsagar](https://github.com/galipremsagar) +- Relax protobuf pinnings. ([#13770](https://github.com/rapidsai/cudf/pull/13770)) [@bdice](https://github.com/bdice) +- Switch fully unbounded window functions to use aggregations ([#13727](https://github.com/rapidsai/cudf/pull/13727)) [@mythrocks](https://github.com/mythrocks) +- Switch to new wheel building pipeline ([#13723](https://github.com/rapidsai/cudf/pull/13723)) [@vyasr](https://github.com/vyasr) +- Revert CUDA 12.0 CI workflows to branch-23.08. ([#13719](https://github.com/rapidsai/cudf/pull/13719)) [@bdice](https://github.com/bdice) +- Adding identify minimum version requirement ([#13713](https://github.com/rapidsai/cudf/pull/13713)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Enforce deprecations and add clarifications around existing deprecations ([#13710](https://github.com/rapidsai/cudf/pull/13710)) [@galipremsagar](https://github.com/galipremsagar) +- Optimize ORC reader performance for list data ([#13708](https://github.com/rapidsai/cudf/pull/13708)) [@vyasr](https://github.com/vyasr) +- fix limit overflow message in a docstring ([#13703](https://github.com/rapidsai/cudf/pull/13703)) [@ahmet-uyar](https://github.com/ahmet-uyar) +- Alleviates JSON parser's need for multi-file sources to end with a newline ([#13702](https://github.com/rapidsai/cudf/pull/13702)) [@elstehle](https://github.com/elstehle) +- Update cython-lint and replace flake8 with ruff ([#13699](https://github.com/rapidsai/cudf/pull/13699)) [@vyasr](https://github.com/vyasr) +- Add `__dask_tokenize__` definitions to cudf classes ([#13695](https://github.com/rapidsai/cudf/pull/13695)) [@rjzamora](https://github.com/rjzamora) +- Convert libcudf hashing benchmarks to nvbench ([#13694](https://github.com/rapidsai/cudf/pull/13694)) [@davidwendt](https://github.com/davidwendt) +- Separate MurmurHash32 from hash_functions.cuh ([#13681](https://github.com/rapidsai/cudf/pull/13681)) [@davidwendt](https://github.com/davidwendt) +- Improve performance of cudf::strings::split on whitespace ([#13680](https://github.com/rapidsai/cudf/pull/13680)) [@davidwendt](https://github.com/davidwendt) +- Allow ORC and Parquet writers to write nullable columns without nulls as non-nullable ([#13675](https://github.com/rapidsai/cudf/pull/13675)) [@vuule](https://github.com/vuule) +- Raise a NotImplementedError in to_datetime when utc is passed ([#13670](https://github.com/rapidsai/cudf/pull/13670)) [@shwina](https://github.com/shwina) +- Add rmm_mode parameter to nvbench base fixture ([#13668](https://github.com/rapidsai/cudf/pull/13668)) [@davidwendt](https://github.com/davidwendt) +- Fix multiindex loc ordering in pandas-compat mode ([#13660](https://github.com/rapidsai/cudf/pull/13660)) [@wence-](https://github.com/wence-) +- Add nvtext hash_character_ngrams function ([#13654](https://github.com/rapidsai/cudf/pull/13654)) [@davidwendt](https://github.com/davidwendt) +- Avoid storing metadata in pointers in ORC and Parquet writers ([#13648](https://github.com/rapidsai/cudf/pull/13648)) [@vuule](https://github.com/vuule) +- Acquire spill lock in to/from_arrow ([#13646](https://github.com/rapidsai/cudf/pull/13646)) [@shwina](https://github.com/shwina) +- Expose stable versions of libcudf sort routines 
([#13634](https://github.com/rapidsai/cudf/pull/13634)) [@wence-](https://github.com/wence-) +- Separate out hash_test.cpp source for each hash API ([#13633](https://github.com/rapidsai/cudf/pull/13633)) [@davidwendt](https://github.com/davidwendt) +- Remove deprecated cudf::strings::slice_strings (by delimiter) functions ([#13628](https://github.com/rapidsai/cudf/pull/13628)) [@davidwendt](https://github.com/davidwendt) +- Create separate libcudf hash APIs for each supported hash function ([#13626](https://github.com/rapidsai/cudf/pull/13626)) [@davidwendt](https://github.com/davidwendt) +- Add convert_dtypes API ([#13623](https://github.com/rapidsai/cudf/pull/13623)) [@shwina](https://github.com/shwina) +- Clean up cupy in dependencies.yaml. ([#13617](https://github.com/rapidsai/cudf/pull/13617)) [@bdice](https://github.com/bdice) +- Use cuda-version to constrain cudatoolkit. ([#13615](https://github.com/rapidsai/cudf/pull/13615)) [@bdice](https://github.com/bdice) +- Add murmurhash3_x64_128 function to libcudf ([#13604](https://github.com/rapidsai/cudf/pull/13604)) [@davidwendt](https://github.com/davidwendt) +- Performance improvement for cudf::strings::like ([#13594](https://github.com/rapidsai/cudf/pull/13594)) [@davidwendt](https://github.com/davidwendt) +- Remove deprecated cudf.set_allocator. ([#13591](https://github.com/rapidsai/cudf/pull/13591)) [@bdice](https://github.com/bdice) +- Clean up cudf device atomic with `cuda::atomic_ref` ([#13583](https://github.com/rapidsai/cudf/pull/13583)) [@PointKernel](https://github.com/PointKernel) +- Add java bindings for distinct count ([#13573](https://github.com/rapidsai/cudf/pull/13573)) [@revans2](https://github.com/revans2) +- Use nvcomp conda package. ([#13566](https://github.com/rapidsai/cudf/pull/13566)) [@bdice](https://github.com/bdice) +- Add exception to string_scalar if input string exceeds size_type ([#13560](https://github.com/rapidsai/cudf/pull/13560)) [@davidwendt](https://github.com/davidwendt) +- Add dispatch for `cudf.Dataframe` to/from `pyarrow.Table` conversion ([#13558](https://github.com/rapidsai/cudf/pull/13558)) [@rjzamora](https://github.com/rjzamora) +- Get rid of `cuco::pair_type` aliases ([#13553](https://github.com/rapidsai/cudf/pull/13553)) [@PointKernel](https://github.com/PointKernel) +- Introduce parity with pandas when `sort=False` in `Groupby` ([#13551](https://github.com/rapidsai/cudf/pull/13551)) [@galipremsagar](https://github.com/galipremsagar) +- Update CMake in docker to 3.26.4 ([#13550](https://github.com/rapidsai/cudf/pull/13550)) [@NvTimLiu](https://github.com/NvTimLiu) +- Clarify source of error message in stream testing. ([#13541](https://github.com/rapidsai/cudf/pull/13541)) [@bdice](https://github.com/bdice) +- Deprecate `strings_to_categorical` in `cudf.read_parquet` ([#13540](https://github.com/rapidsai/cudf/pull/13540)) [@galipremsagar](https://github.com/galipremsagar) +- Update to CMake 3.26.4 ([#13538](https://github.com/rapidsai/cudf/pull/13538)) [@vyasr](https://github.com/vyasr) +- s3 folder naming fix ([#13536](https://github.com/rapidsai/cudf/pull/13536)) [@AyodeAwe](https://github.com/AyodeAwe) +- Implement iloc-getitem using parse-don't-validate approach ([#13534](https://github.com/rapidsai/cudf/pull/13534)) [@wence-](https://github.com/wence-) +- Make synchronization explicit in the names of `hostdevice_*` copying APIs ([#13530](https://github.com/rapidsai/cudf/pull/13530)) [@ttnghia](https://github.com/ttnghia) +- Add benchmark (Google Benchmark) dependency to conda packages. 
([#13528](https://github.com/rapidsai/cudf/pull/13528)) [@bdice](https://github.com/bdice) +- Add libcufile to dependencies.yaml. ([#13523](https://github.com/rapidsai/cudf/pull/13523)) [@bdice](https://github.com/bdice) +- Fix some memoization logic in groupby/sort/sort_helper.cu ([#13521](https://github.com/rapidsai/cudf/pull/13521)) [@davidwendt](https://github.com/davidwendt) +- Use sizes_to_offsets_iterator in cudf::gather for strings ([#13520](https://github.com/rapidsai/cudf/pull/13520)) [@davidwendt](https://github.com/davidwendt) +- use rapids-upload-docs script ([#13518](https://github.com/rapidsai/cudf/pull/13518)) [@AyodeAwe](https://github.com/AyodeAwe) +- Support UTF-8 BOM in CSV reader ([#13516](https://github.com/rapidsai/cudf/pull/13516)) [@davidwendt](https://github.com/davidwendt) +- Move stream-related test configuration to CMake ([#13513](https://github.com/rapidsai/cudf/pull/13513)) [@vyasr](https://github.com/vyasr) +- Implement `cudf.option_context` ([#13511](https://github.com/rapidsai/cudf/pull/13511)) [@galipremsagar](https://github.com/galipremsagar) +- Unpin `dask` and `distributed` for development ([#13508](https://github.com/rapidsai/cudf/pull/13508)) [@galipremsagar](https://github.com/galipremsagar) +- Change build.sh to use pip install instead of setup.py ([#13507](https://github.com/rapidsai/cudf/pull/13507)) [@vyasr](https://github.com/vyasr) +- Use test default stream ([#13506](https://github.com/rapidsai/cudf/pull/13506)) [@vyasr](https://github.com/vyasr) +- Remove documentation build scripts for Jenkins ([#13495](https://github.com/rapidsai/cudf/pull/13495)) [@ajschmidt8](https://github.com/ajschmidt8) +- Use east const in include files ([#13494](https://github.com/rapidsai/cudf/pull/13494)) [@karthikeyann](https://github.com/karthikeyann) +- Use east const in src files ([#13493](https://github.com/rapidsai/cudf/pull/13493)) [@karthikeyann](https://github.com/karthikeyann) +- Use east const in tests files ([#13492](https://github.com/rapidsai/cudf/pull/13492)) [@karthikeyann](https://github.com/karthikeyann) +- Use east const in benchmarks files ([#13491](https://github.com/rapidsai/cudf/pull/13491)) [@karthikeyann](https://github.com/karthikeyann) +- Performance improvement for nvtext tokenize/token functions ([#13480](https://github.com/rapidsai/cudf/pull/13480)) [@davidwendt](https://github.com/davidwendt) +- Add pd.Float*Dtype to Avro and ORC mappings ([#13475](https://github.com/rapidsai/cudf/pull/13475)) [@mroeschke](https://github.com/mroeschke) +- Use pandas public APIs where available ([#13467](https://github.com/rapidsai/cudf/pull/13467)) [@mroeschke](https://github.com/mroeschke) +- Allow pd.ArrowDtype in cudf.from_pandas ([#13465](https://github.com/rapidsai/cudf/pull/13465)) [@mroeschke](https://github.com/mroeschke) +- Rework libcudf regex benchmarks with nvbench ([#13464](https://github.com/rapidsai/cudf/pull/13464)) [@davidwendt](https://github.com/davidwendt) +- Remove unused max_rows_tensor parameter from subword tokenizer ([#13463](https://github.com/rapidsai/cudf/pull/13463)) [@davidwendt](https://github.com/davidwendt) +- Separate io-text and nvtext pytests into different files ([#13435](https://github.com/rapidsai/cudf/pull/13435)) [@davidwendt](https://github.com/davidwendt) +- Add a move_to function to cudf::string_view::const_iterator ([#13428](https://github.com/rapidsai/cudf/pull/13428)) [@davidwendt](https://github.com/davidwendt) +- Allow newer scikit-build ([#13424](https://github.com/rapidsai/cudf/pull/13424)) 
[@vyasr](https://github.com/vyasr) +- Refactor sort_by_values to sort_values, drop indices from return values. ([#13419](https://github.com/rapidsai/cudf/pull/13419)) [@bdice](https://github.com/bdice) +- Inline Cython exception handler ([#13411](https://github.com/rapidsai/cudf/pull/13411)) [@vyasr](https://github.com/vyasr) +- Init JNI version 23.08.0-SNAPSHOT ([#13401](https://github.com/rapidsai/cudf/pull/13401)) [@pxLi](https://github.com/pxLi) +- Refactor ORC reader ([#13396](https://github.com/rapidsai/cudf/pull/13396)) [@ttnghia](https://github.com/ttnghia) +- JNI: Remove cleaned objects in memory cleaner ([#13378](https://github.com/rapidsai/cudf/pull/13378)) [@res-life](https://github.com/res-life) +- Add tests of currently unsupported indexing ([#13338](https://github.com/rapidsai/cudf/pull/13338)) [@wence-](https://github.com/wence-) +- Performance improvement for some libcudf regex functions for long strings ([#13322](https://github.com/rapidsai/cudf/pull/13322)) [@davidwendt](https://github.com/davidwendt) +- Exposure Tracked Buffer (first step towards unifying copy-on-write and spilling) ([#13307](https://github.com/rapidsai/cudf/pull/13307)) [@madsbk](https://github.com/madsbk) +- Write string data directly to column_buffer in Parquet reader ([#13302](https://github.com/rapidsai/cudf/pull/13302)) [@etseidl](https://github.com/etseidl) +- Add stacktrace into cudf exception types ([#13298](https://github.com/rapidsai/cudf/pull/13298)) [@ttnghia](https://github.com/ttnghia) +- cuDF: Build CUDA 12 packages ([#12922](https://github.com/rapidsai/cudf/pull/12922)) [@bdice](https://github.com/bdice) + +# cuDF 23.06.00 (7 Jun 2023) + +## 🚨 Breaking Changes + +- Fix batch processing for parquet writer ([#13438](https://github.com/rapidsai/cudf/pull/13438)) [@ttnghia](https://github.com/ttnghia) +- Use <NA> instead of null to match pandas. 
([#13415](https://github.com/rapidsai/cudf/pull/13415)) [@bdice](https://github.com/bdice) +- Remove UNKNOWN_NULL_COUNT ([#13372](https://github.com/rapidsai/cudf/pull/13372)) [@vyasr](https://github.com/vyasr) +- Remove default UNKNOWN_NULL_COUNT from cudf::column member functions ([#13341](https://github.com/rapidsai/cudf/pull/13341)) [@davidwendt](https://github.com/davidwendt) +- Use std::overflow_error when output would exceed column size limit ([#13323](https://github.com/rapidsai/cudf/pull/13323)) [@davidwendt](https://github.com/davidwendt) +- Remove null mask and null count from column_view constructors ([#13311](https://github.com/rapidsai/cudf/pull/13311)) [@vyasr](https://github.com/vyasr) +- Change default value of the `observed=` argument in groupby to `True` to reflect the actual behaviour ([#13296](https://github.com/rapidsai/cudf/pull/13296)) [@shwina](https://github.com/shwina) +- Throw error if UNINITIALIZED is passed to cudf::state_null_count ([#13292](https://github.com/rapidsai/cudf/pull/13292)) [@davidwendt](https://github.com/davidwendt) +- Remove default null-count parameter from cudf::make_strings_column factory ([#13227](https://github.com/rapidsai/cudf/pull/13227)) [@davidwendt](https://github.com/davidwendt) +- Remove UNKNOWN_NULL_COUNT where it can be easily computed ([#13205](https://github.com/rapidsai/cudf/pull/13205)) [@vyasr](https://github.com/vyasr) +- Update minimum Python version to Python 3.9 ([#13196](https://github.com/rapidsai/cudf/pull/13196)) [@shwina](https://github.com/shwina) +- Refactor contiguous_split API into contiguous_split.hpp ([#13186](https://github.com/rapidsai/cudf/pull/13186)) [@abellina](https://github.com/abellina) +- Cleanup Parquet chunked writer ([#13094](https://github.com/rapidsai/cudf/pull/13094)) [@ttnghia](https://github.com/ttnghia) +- Cleanup ORC chunked writer ([#13091](https://github.com/rapidsai/cudf/pull/13091)) [@ttnghia](https://github.com/ttnghia) +- Raise `NotImplementedError` when attempting to construct cuDF objects from timezone-aware datetimes ([#13086](https://github.com/rapidsai/cudf/pull/13086)) [@shwina](https://github.com/shwina) +- Remove deprecated regex functions from libcudf ([#13067](https://github.com/rapidsai/cudf/pull/13067)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Upgrade to `arrow-11` ([#12757](https://github.com/rapidsai/cudf/pull/12757)) [@galipremsagar](https://github.com/galipremsagar) +- Implement Python drop_duplicates with cudf::stable_distinct. ([#11656](https://github.com/rapidsai/cudf/pull/11656)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🐛 Bug Fixes + +- Fix valid count computation in offset_bitmask_binop kernel ([#13489](https://github.com/rapidsai/cudf/pull/13489)) [@davidwendt](https://github.com/davidwendt) +- Fix writing of ORC files with empty rowgroups ([#13466](https://github.com/rapidsai/cudf/pull/13466)) [@vuule](https://github.com/vuule) +- Fix cudf::repeat logic when count is zero ([#13459](https://github.com/rapidsai/cudf/pull/13459)) [@davidwendt](https://github.com/davidwendt) +- Fix batch processing for parquet writer ([#13438](https://github.com/rapidsai/cudf/pull/13438)) [@ttnghia](https://github.com/ttnghia) +- Fix invalid use of std::exclusive_scan in Parquet writer ([#13434](https://github.com/rapidsai/cudf/pull/13434)) [@etseidl](https://github.com/etseidl) +- Patch numba if it is imported first to ensure minor version compatibility works. 
([#13433](https://github.com/rapidsai/cudf/pull/13433)) [@bdice](https://github.com/bdice) +- Fix cudf::strings::replace_with_backrefs hang on empty match result ([#13418](https://github.com/rapidsai/cudf/pull/13418)) [@davidwendt](https://github.com/davidwendt) +- Use <NA> instead of null to match pandas. ([#13415](https://github.com/rapidsai/cudf/pull/13415)) [@bdice](https://github.com/bdice) +- Fix tokenize with non-space delimiter ([#13403](https://github.com/rapidsai/cudf/pull/13403)) [@shwina](https://github.com/shwina) +- Fix groupby head/tail for empty dataframe ([#13398](https://github.com/rapidsai/cudf/pull/13398)) [@shwina](https://github.com/shwina) +- Default to closed="right" in `IntervalIndex` constructor ([#13394](https://github.com/rapidsai/cudf/pull/13394)) [@shwina](https://github.com/shwina) +- Correctly reorder and reindex scan groupbys with null keys ([#13389](https://github.com/rapidsai/cudf/pull/13389)) [@wence-](https://github.com/wence-) +- Fix unused argument errors in nvcc 11.5 ([#13387](https://github.com/rapidsai/cudf/pull/13387)) [@abellina](https://github.com/abellina) +- Updates needed to work with jitify that leverages libcudacxx ([#13383](https://github.com/rapidsai/cudf/pull/13383)) [@robertmaynard](https://github.com/robertmaynard) +- Fix unused parameter warning/error in parquet/page_data.cu ([#13367](https://github.com/rapidsai/cudf/pull/13367)) [@davidwendt](https://github.com/davidwendt) +- Fix page size estimation in Parquet writer ([#13364](https://github.com/rapidsai/cudf/pull/13364)) [@etseidl](https://github.com/etseidl) +- Fix subword_tokenize error when input contains no tokens ([#13320](https://github.com/rapidsai/cudf/pull/13320)) [@davidwendt](https://github.com/davidwendt) +- Support gcc 12 as the C++ compiler ([#13316](https://github.com/rapidsai/cudf/pull/13316)) [@robertmaynard](https://github.com/robertmaynard) +- Correctly set bitmask size in `from_column_view` ([#13315](https://github.com/rapidsai/cudf/pull/13315)) [@wence-](https://github.com/wence-) +- Fix approach to detecting assignment for gte/lte operators ([#13285](https://github.com/rapidsai/cudf/pull/13285)) [@vyasr](https://github.com/vyasr) +- Fix parquet schema interpretation issue ([#13277](https://github.com/rapidsai/cudf/pull/13277)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix 64bit shift bug in avro reader ([#13276](https://github.com/rapidsai/cudf/pull/13276)) [@karthikeyann](https://github.com/karthikeyann) +- Fix unused variables/parameters in parquet/writer_impl.cu ([#13263](https://github.com/rapidsai/cudf/pull/13263)) [@davidwendt](https://github.com/davidwendt) +- Clean up buffers in case AssertionError ([#13262](https://github.com/rapidsai/cudf/pull/13262)) [@razajafri](https://github.com/razajafri) +- Allow empty input table in ast `compute_column` ([#13245](https://github.com/rapidsai/cudf/pull/13245)) [@wence-](https://github.com/wence-) +- Fix structs_column_wrapper constructors to copy input column wrappers ([#13243](https://github.com/rapidsai/cudf/pull/13243)) [@davidwendt](https://github.com/davidwendt) +- Fix the row index stream order in ORC reader ([#13242](https://github.com/rapidsai/cudf/pull/13242)) [@vuule](https://github.com/vuule) +- Make `is_decompression_disabled` and `is_compression_disabled` thread-safe ([#13240](https://github.com/rapidsai/cudf/pull/13240)) [@vuule](https://github.com/vuule) +- Add [[maybe_unused]] to nvbench environment. 
([#13219](https://github.com/rapidsai/cudf/pull/13219)) [@bdice](https://github.com/bdice) +- Fix race in ORC string dictionary creation ([#13214](https://github.com/rapidsai/cudf/pull/13214)) [@revans2](https://github.com/revans2) +- Add scalar argtypes to udf cache keys ([#13194](https://github.com/rapidsai/cudf/pull/13194)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix unused parameter warning/error in grouped_rolling.cu ([#13192](https://github.com/rapidsai/cudf/pull/13192)) [@davidwendt](https://github.com/davidwendt) +- Avoid skbuild 0.17.2 which affected the cmake -DPython_LIBRARY string ([#13188](https://github.com/rapidsai/cudf/pull/13188)) [@sevagh](https://github.com/sevagh) +- Fix `hostdevice_vector::subspan` ([#13187](https://github.com/rapidsai/cudf/pull/13187)) [@ttnghia](https://github.com/ttnghia) +- Use custom nvbench entry point to ensure `cudf::nvbench_base_fixture` usage ([#13183](https://github.com/rapidsai/cudf/pull/13183)) [@robertmaynard](https://github.com/robertmaynard) +- Fix slice_strings to return empty strings for stop < start indices ([#13178](https://github.com/rapidsai/cudf/pull/13178)) [@davidwendt](https://github.com/davidwendt) +- Allow compilation with any GTest version 1.11+ ([#13153](https://github.com/rapidsai/cudf/pull/13153)) [@robertmaynard](https://github.com/robertmaynard) +- Fix a few clang-format style check errors ([#13146](https://github.com/rapidsai/cudf/pull/13146)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Fix `Series` and `DataFrame` constructors to validate index lengths ([#13122](https://github.com/rapidsai/cudf/pull/13122)) [@galipremsagar](https://github.com/galipremsagar) +- Fix hash join when the input tables have nulls on only one side ([#13120](https://github.com/rapidsai/cudf/pull/13120)) [@ttnghia](https://github.com/ttnghia) +- Fix GPU_ARCHS setting in Java CMake build and CMAKE_CUDA_ARCHITECTURES in Python package build. 
([#13117](https://github.com/rapidsai/cudf/pull/13117)) [@davidwendt](https://github.com/davidwendt) +- Adds checks to make sure json reader won't overflow ([#13115](https://github.com/rapidsai/cudf/pull/13115)) [@elstehle](https://github.com/elstehle) +- Fix `null_count` of columns returned by `chunked_parquet_reader` ([#13111](https://github.com/rapidsai/cudf/pull/13111)) [@vuule](https://github.com/vuule) +- Fixes sliced list and struct column bug in JSON chunked writer ([#13108](https://github.com/rapidsai/cudf/pull/13108)) [@karthikeyann](https://github.com/karthikeyann) +- [REVIEW] Fix missing confluent kafka version ([#13101](https://github.com/rapidsai/cudf/pull/13101)) [@galipremsagar](https://github.com/galipremsagar) +- Use make_empty_lists_column instead of make_empty_column(type_id::LIST) ([#13099](https://github.com/rapidsai/cudf/pull/13099)) [@davidwendt](https://github.com/davidwendt) +- Raise `NotImplementedError` when attempting to construct cuDF objects from timezone-aware datetimes ([#13086](https://github.com/rapidsai/cudf/pull/13086)) [@shwina](https://github.com/shwina) +- Fix column selection `read_parquet` benchmarks ([#13082](https://github.com/rapidsai/cudf/pull/13082)) [@vuule](https://github.com/vuule) +- Fix bugs in iterative groupby apply algorithm ([#13078](https://github.com/rapidsai/cudf/pull/13078)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add algorithm include in data_sink.hpp ([#13068](https://github.com/rapidsai/cudf/pull/13068)) [@ahendriksen](https://github.com/ahendriksen) +- Fix tests/identify_stream_usage.cpp ([#13066](https://github.com/rapidsai/cudf/pull/13066)) [@ahendriksen](https://github.com/ahendriksen) +- Prevent overflow with `skip_rows` in ORC and Parquet readers ([#13063](https://github.com/rapidsai/cudf/pull/13063)) [@vuule](https://github.com/vuule) +- Add except declaration in Cython interface for regex_program::create ([#13054](https://github.com/rapidsai/cudf/pull/13054)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Fix branch version in CI scripts ([#13029](https://github.com/rapidsai/cudf/pull/13029)) [@galipremsagar](https://github.com/galipremsagar) +- Fix OOB memory access in CSV reader when reading without NA values ([#13011](https://github.com/rapidsai/cudf/pull/13011)) [@vuule](https://github.com/vuule) +- Fix read_avro() skip_rows and num_rows. ([#12912](https://github.com/rapidsai/cudf/pull/12912)) [@tpn](https://github.com/tpn) +- Purge nonempty nulls from byte_cast list outputs. 
([#11971](https://github.com/rapidsai/cudf/pull/11971)) [@bdice](https://github.com/bdice) +- Fix consumption of CPU-backed interchange protocol dataframes ([#11392](https://github.com/rapidsai/cudf/pull/11392)) [@shwina](https://github.com/shwina) + +## 🚀 New Features + +- Remove numba JIT kernel usage from dataframe copy tests ([#13385](https://github.com/rapidsai/cudf/pull/13385)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add JNI for ORC/Parquet writer compression statistics ([#13376](https://github.com/rapidsai/cudf/pull/13376)) [@ttnghia](https://github.com/ttnghia) +- Use _compile_or_get in JIT groupby apply ([#13350](https://github.com/rapidsai/cudf/pull/13350)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- cuDF numba cuda 12 updates ([#13337](https://github.com/rapidsai/cudf/pull/13337)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add tz_convert method to convert between timestamps ([#13328](https://github.com/rapidsai/cudf/pull/13328)) [@shwina](https://github.com/shwina) +- Optionally return compression statistics from ORC and Parquet writers ([#13294](https://github.com/rapidsai/cudf/pull/13294)) [@vuule](https://github.com/vuule) +- Support the case=False argument to str.contains ([#13290](https://github.com/rapidsai/cudf/pull/13290)) [@shwina](https://github.com/shwina) +- Add an event handler for ColumnVector.close ([#13279](https://github.com/rapidsai/cudf/pull/13279)) [@abellina](https://github.com/abellina) +- JNI api for cudf::chunked_pack ([#13278](https://github.com/rapidsai/cudf/pull/13278)) [@abellina](https://github.com/abellina) +- Implement a chunked_pack API ([#13260](https://github.com/rapidsai/cudf/pull/13260)) [@abellina](https://github.com/abellina) +- Update cudf recipes to use GTest version to >=1.13 ([#13207](https://github.com/rapidsai/cudf/pull/13207)) [@robertmaynard](https://github.com/robertmaynard) +- JNI changes for range-extents in window functions. 
([#13199](https://github.com/rapidsai/cudf/pull/13199)) [@mythrocks](https://github.com/mythrocks) +- Add support for DatetimeTZDtype and tz_localize ([#13163](https://github.com/rapidsai/cudf/pull/13163)) [@shwina](https://github.com/shwina) +- Add IS_NULL operator to AST ([#13145](https://github.com/rapidsai/cudf/pull/13145)) [@karthikeyann](https://github.com/karthikeyann) +- STRING order-by column for RANGE window functions ([#13143](https://github.com/rapidsai/cudf/pull/13143)) [@mythrocks](https://github.com/mythrocks) +- Update `contains_table` to experimental row hasher and equality comparator ([#13119](https://github.com/rapidsai/cudf/pull/13119)) [@divyegala](https://github.com/divyegala) +- Automatically select `GroupBy.apply` algorithm based on if the UDF is jittable ([#13113](https://github.com/rapidsai/cudf/pull/13113)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Refactor Parquet chunked writer ([#13076](https://github.com/rapidsai/cudf/pull/13076)) [@ttnghia](https://github.com/ttnghia) +- Add Python bindings for string literal support in AST ([#13073](https://github.com/rapidsai/cudf/pull/13073)) [@karthikeyann](https://github.com/karthikeyann) +- Add Java bindings for string literal support in AST ([#13072](https://github.com/rapidsai/cudf/pull/13072)) [@karthikeyann](https://github.com/karthikeyann) +- Add string scalar support in AST ([#13061](https://github.com/rapidsai/cudf/pull/13061)) [@karthikeyann](https://github.com/karthikeyann) +- Log cuIO warnings using the libcudf logger ([#13043](https://github.com/rapidsai/cudf/pull/13043)) [@vuule](https://github.com/vuule) +- Update `mixed_join` to use experimental row hasher and comparator ([#13028](https://github.com/rapidsai/cudf/pull/13028)) [@divyegala](https://github.com/divyegala) +- Support structs of lists in row lexicographic comparator ([#13005](https://github.com/rapidsai/cudf/pull/13005)) [@ttnghia](https://github.com/ttnghia) +- Adding `hostdevice_span` that is a span createable from `hostdevice_vector` ([#12981](https://github.com/rapidsai/cudf/pull/12981)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add nvtext::minhash function ([#12961](https://github.com/rapidsai/cudf/pull/12961)) [@davidwendt](https://github.com/davidwendt) +- Support lists of structs in row lexicographic comparator ([#12953](https://github.com/rapidsai/cudf/pull/12953)) [@ttnghia](https://github.com/ttnghia) +- Update `join` to use experimental row hasher and comparator ([#12787](https://github.com/rapidsai/cudf/pull/12787)) [@divyegala](https://github.com/divyegala) +- Implement Python drop_duplicates with cudf::stable_distinct. ([#11656](https://github.com/rapidsai/cudf/pull/11656)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🛠️ Improvements + +- Drop extraneous dependencies from cudf conda recipe. ([#13406](https://github.com/rapidsai/cudf/pull/13406)) [@bdice](https://github.com/bdice) +- Handle some corner-cases in indexing with boolean masks ([#13402](https://github.com/rapidsai/cudf/pull/13402)) [@wence-](https://github.com/wence-) +- Add cudf::stable_distinct public API, tests, and benchmarks. 
([#13392](https://github.com/rapidsai/cudf/pull/13392)) [@bdice](https://github.com/bdice) +- [JNI] Pass this ColumnVector to the onClosed event handler ([#13386](https://github.com/rapidsai/cudf/pull/13386)) [@abellina](https://github.com/abellina) +- Fix JNI method with mismatched parameter list ([#13384](https://github.com/rapidsai/cudf/pull/13384)) [@ttnghia](https://github.com/ttnghia) +- Split up experimental_row_operator_tests.cu to improve its compile time ([#13382](https://github.com/rapidsai/cudf/pull/13382)) [@davidwendt](https://github.com/davidwendt) +- Deprecate cudf::strings::slice_strings APIs that accept delimiters ([#13373](https://github.com/rapidsai/cudf/pull/13373)) [@davidwendt](https://github.com/davidwendt) +- Remove UNKNOWN_NULL_COUNT ([#13372](https://github.com/rapidsai/cudf/pull/13372)) [@vyasr](https://github.com/vyasr) +- Move some nvtext benchmarks to nvbench ([#13368](https://github.com/rapidsai/cudf/pull/13368)) [@davidwendt](https://github.com/davidwendt) +- run docs nightly too ([#13366](https://github.com/rapidsai/cudf/pull/13366)) [@AyodeAwe](https://github.com/AyodeAwe) +- Add warning for default `dtype` parameter in `get_dummies` ([#13365](https://github.com/rapidsai/cudf/pull/13365)) [@galipremsagar](https://github.com/galipremsagar) +- Add log messages about kvikIO compatibility mode ([#13363](https://github.com/rapidsai/cudf/pull/13363)) [@vuule](https://github.com/vuule) +- Switch back to using primary shared-action-workflows branch ([#13362](https://github.com/rapidsai/cudf/pull/13362)) [@vyasr](https://github.com/vyasr) +- Deprecate `StringIndex` and use `Index` instead ([#13361](https://github.com/rapidsai/cudf/pull/13361)) [@galipremsagar](https://github.com/galipremsagar) +- Ensure columns have valid null counts in CUDF JNI. 
([#13355](https://github.com/rapidsai/cudf/pull/13355)) [@mythrocks](https://github.com/mythrocks) +- Expunge most uses of `TypeVar(bound="Foo")` ([#13346](https://github.com/rapidsai/cudf/pull/13346)) [@wence-](https://github.com/wence-) +- Remove all references to UNKNOWN_NULL_COUNT in Python ([#13345](https://github.com/rapidsai/cudf/pull/13345)) [@vyasr](https://github.com/vyasr) +- Improve `distinct_count` with `cuco::static_set` ([#13343](https://github.com/rapidsai/cudf/pull/13343)) [@PointKernel](https://github.com/PointKernel) +- Fix `contiguous_split` performance ([#13342](https://github.com/rapidsai/cudf/pull/13342)) [@ttnghia](https://github.com/ttnghia) +- Remove default UNKNOWN_NULL_COUNT from cudf::column member functions ([#13341](https://github.com/rapidsai/cudf/pull/13341)) [@davidwendt](https://github.com/davidwendt) +- Update mypy to 1.3 ([#13340](https://github.com/rapidsai/cudf/pull/13340)) [@wence-](https://github.com/wence-) +- [Java] Purge non-empty nulls when setting validity ([#13335](https://github.com/rapidsai/cudf/pull/13335)) [@razajafri](https://github.com/razajafri) +- Add row-wise filtering step to `read_parquet` ([#13334](https://github.com/rapidsai/cudf/pull/13334)) [@rjzamora](https://github.com/rjzamora) +- Performance improvement for nvtext::minhash ([#13333](https://github.com/rapidsai/cudf/pull/13333)) [@davidwendt](https://github.com/davidwendt) +- Fix some libcudf functions to set the null count on returning columns ([#13331](https://github.com/rapidsai/cudf/pull/13331)) [@davidwendt](https://github.com/davidwendt) +- Change cudf::detail::concatenate_masks to return null-count ([#13330](https://github.com/rapidsai/cudf/pull/13330)) [@davidwendt](https://github.com/davidwendt) +- Move `meta` calculation in `dask_cudf.read_parquet` ([#13327](https://github.com/rapidsai/cudf/pull/13327)) [@rjzamora](https://github.com/rjzamora) +- Changes to support Numpy >= 1.24 ([#13325](https://github.com/rapidsai/cudf/pull/13325)) [@shwina](https://github.com/shwina) +- Use std::overflow_error when output would exceed column size limit ([#13323](https://github.com/rapidsai/cudf/pull/13323)) [@davidwendt](https://github.com/davidwendt) +- Clean up `distinct_count` benchmark ([#13321](https://github.com/rapidsai/cudf/pull/13321)) [@PointKernel](https://github.com/PointKernel) +- Fix gtest pinning to 1.13.0. 
([#13319](https://github.com/rapidsai/cudf/pull/13319)) [@bdice](https://github.com/bdice) +- Remove null mask and null count from column_view constructors ([#13311](https://github.com/rapidsai/cudf/pull/13311)) [@vyasr](https://github.com/vyasr) +- Address feedback from 13289 ([#13306](https://github.com/rapidsai/cudf/pull/13306)) [@vyasr](https://github.com/vyasr) +- Change default value of the `observed=` argument in groupby to `True` to reflect the actual behaviour ([#13296](https://github.com/rapidsai/cudf/pull/13296)) [@shwina](https://github.com/shwina) +- First check for `BaseDtype` when inferring the data type of an arbitrary object ([#13295](https://github.com/rapidsai/cudf/pull/13295)) [@shwina](https://github.com/shwina) +- Throw error if UNINITIALIZED is passed to cudf::state_null_count ([#13292](https://github.com/rapidsai/cudf/pull/13292)) [@davidwendt](https://github.com/davidwendt) +- Support CUDA 12.0 for pip wheels ([#13289](https://github.com/rapidsai/cudf/pull/13289)) [@divyegala](https://github.com/divyegala) +- Refactor `transform_lists_of_structs` in `row_operators.cu` ([#13288](https://github.com/rapidsai/cudf/pull/13288)) [@ttnghia](https://github.com/ttnghia) +- Branch 23.06 merge 23.04 ([#13286](https://github.com/rapidsai/cudf/pull/13286)) [@vyasr](https://github.com/vyasr) +- Update cupy dependency ([#13284](https://github.com/rapidsai/cudf/pull/13284)) [@vyasr](https://github.com/vyasr) +- Performance improvement in cudf::strings::join_strings for long strings ([#13283](https://github.com/rapidsai/cudf/pull/13283)) [@davidwendt](https://github.com/davidwendt) +- Fix unused variables and functions ([#13275](https://github.com/rapidsai/cudf/pull/13275)) [@karthikeyann](https://github.com/karthikeyann) +- Fix integer overflow in `partition` `scatter_map` construction ([#13272](https://github.com/rapidsai/cudf/pull/13272)) [@wence-](https://github.com/wence-) +- Numba 0.57 compatibility fixes ([#13271](https://github.com/rapidsai/cudf/pull/13271)) [@gmarkall](https://github.com/gmarkall) +- Performance improvement in cudf::strings::all_characters_of_type ([#13259](https://github.com/rapidsai/cudf/pull/13259)) [@davidwendt](https://github.com/davidwendt) +- Remove default null-count parameter from some libcudf factory functions ([#13258](https://github.com/rapidsai/cudf/pull/13258)) [@davidwendt](https://github.com/davidwendt) +- Roll our own generate_string() because mimesis' has gone away ([#13257](https://github.com/rapidsai/cudf/pull/13257)) [@shwina](https://github.com/shwina) +- Build wheels using new single image workflow ([#13249](https://github.com/rapidsai/cudf/pull/13249)) [@vyasr](https://github.com/vyasr) +- Enable sccache hits from local builds ([#13248](https://github.com/rapidsai/cudf/pull/13248)) [@AyodeAwe](https://github.com/AyodeAwe) +- Revert to branch-23.06 for shared-action-workflows ([#13247](https://github.com/rapidsai/cudf/pull/13247)) [@shwina](https://github.com/shwina) +- Introduce `pandas_compatible` option in `cudf` ([#13241](https://github.com/rapidsai/cudf/pull/13241)) [@galipremsagar](https://github.com/galipremsagar) +- Add metadata_builder helper class ([#13232](https://github.com/rapidsai/cudf/pull/13232)) [@abellina](https://github.com/abellina) +- Use libkvikio conda packages in libcudf, add explicit libcufile dependency. 
([#13231](https://github.com/rapidsai/cudf/pull/13231)) [@bdice](https://github.com/bdice) +- Remove default null-count parameter from cudf::make_strings_column factory ([#13227](https://github.com/rapidsai/cudf/pull/13227)) [@davidwendt](https://github.com/davidwendt) +- Performance improvement in cudf::strings::find/rfind for long strings ([#13226](https://github.com/rapidsai/cudf/pull/13226)) [@davidwendt](https://github.com/davidwendt) +- Add chunked reader benchmark ([#13223](https://github.com/rapidsai/cudf/pull/13223)) [@SrikarVanavasam](https://github.com/SrikarVanavasam) +- Set the null count in output columns in the CSV reader ([#13221](https://github.com/rapidsai/cudf/pull/13221)) [@vuule](https://github.com/vuule) +- Skip Non-Empty nulls tests for the nightly build just like we skip CuFileTest and CudaFatalTest ([#13213](https://github.com/rapidsai/cudf/pull/13213)) [@razajafri](https://github.com/razajafri) +- Fix string_scalar stream usage in write_json.cu ([#13212](https://github.com/rapidsai/cudf/pull/13212)) [@davidwendt](https://github.com/davidwendt) +- Use canonicalized name for dlopen'd libraries (libcufile) ([#13210](https://github.com/rapidsai/cudf/pull/13210)) [@shwina](https://github.com/shwina) +- Refactor pinned memory vector and ORC+Parquet writers ([#13206](https://github.com/rapidsai/cudf/pull/13206)) [@ttnghia](https://github.com/ttnghia) +- Remove UNKNOWN_NULL_COUNT where it can be easily computed ([#13205](https://github.com/rapidsai/cudf/pull/13205)) [@vyasr](https://github.com/vyasr) +- Optimization to decoding of parquet level streams ([#13203](https://github.com/rapidsai/cudf/pull/13203)) [@nvdbaranec](https://github.com/nvdbaranec) +- Clean up and simplify `gpuDecideCompression` ([#13202](https://github.com/rapidsai/cudf/pull/13202)) [@vuule](https://github.com/vuule) +- Use std::array for a statically sized vector in `create_serialized_trie` ([#13201](https://github.com/rapidsai/cudf/pull/13201)) [@vuule](https://github.com/vuule) +- Update minimum Python version to Python 3.9 ([#13196](https://github.com/rapidsai/cudf/pull/13196)) [@shwina](https://github.com/shwina) +- Refactor contiguous_split API into contiguous_split.hpp ([#13186](https://github.com/rapidsai/cudf/pull/13186)) [@abellina](https://github.com/abellina) +- Remove usage of rapids-get-rapids-version-from-git ([#13184](https://github.com/rapidsai/cudf/pull/13184)) [@jjacobelli](https://github.com/jjacobelli) +- Enable mixed-dtype decimal/scalar binary operations ([#13171](https://github.com/rapidsai/cudf/pull/13171)) [@shwina](https://github.com/shwina) +- Split up unique_count.cu to improve build time ([#13169](https://github.com/rapidsai/cudf/pull/13169)) [@davidwendt](https://github.com/davidwendt) +- Use nvtx3 includes in string examples. 
([#13165](https://github.com/rapidsai/cudf/pull/13165)) [@bdice](https://github.com/bdice) +- Change some .cu gtest files to .cpp ([#13155](https://github.com/rapidsai/cudf/pull/13155)) [@davidwendt](https://github.com/davidwendt) +- Remove wheel pytest verbosity ([#13151](https://github.com/rapidsai/cudf/pull/13151)) [@sevagh](https://github.com/sevagh) +- Fix libcudf to always pass null-count to set_null_mask ([#13149](https://github.com/rapidsai/cudf/pull/13149)) [@davidwendt](https://github.com/davidwendt) +- Fix gtests to always pass null-count to set_null_mask calls ([#13148](https://github.com/rapidsai/cudf/pull/13148)) [@davidwendt](https://github.com/davidwendt) +- Optimize JSON writer ([#13144](https://github.com/rapidsai/cudf/pull/13144)) [@karthikeyann](https://github.com/karthikeyann) +- Performance improvement for libcudf upper/lower conversion for long strings ([#13142](https://github.com/rapidsai/cudf/pull/13142)) [@davidwendt](https://github.com/davidwendt) +- [REVIEW] Deprecate `pad` and `backfill` methods ([#13140](https://github.com/rapidsai/cudf/pull/13140)) [@galipremsagar](https://github.com/galipremsagar) +- Use CTAD instead of functions in ProtobufReader ([#13135](https://github.com/rapidsai/cudf/pull/13135)) [@vuule](https://github.com/vuule) +- Remove more instances of `UNKNOWN_NULL_COUNT` ([#13134](https://github.com/rapidsai/cudf/pull/13134)) [@vyasr](https://github.com/vyasr) +- Update clang-format to 16.0.1. ([#13133](https://github.com/rapidsai/cudf/pull/13133)) [@bdice](https://github.com/bdice) +- Add log messages about cuIO's nvCOMP and cuFile use ([#13132](https://github.com/rapidsai/cudf/pull/13132)) [@vuule](https://github.com/vuule) +- Branch 23.06 merge 23.04 ([#13131](https://github.com/rapidsai/cudf/pull/13131)) [@vyasr](https://github.com/vyasr) +- Compute null-count in cudf::detail::slice ([#13124](https://github.com/rapidsai/cudf/pull/13124)) [@davidwendt](https://github.com/davidwendt) +- Use ARC V2 self-hosted runners for GPU jobs ([#13123](https://github.com/rapidsai/cudf/pull/13123)) [@jjacobelli](https://github.com/jjacobelli) +- Set null-count in linked_column_view conversion operator ([#13121](https://github.com/rapidsai/cudf/pull/13121)) [@davidwendt](https://github.com/davidwendt) +- Adding ifdefs around nvcc-specific pragmas ([#13110](https://github.com/rapidsai/cudf/pull/13110)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add null-count parameter to json experimental parse_data utility ([#13107](https://github.com/rapidsai/cudf/pull/13107)) [@davidwendt](https://github.com/davidwendt) +- Remove uses-setup-env-vars ([#13105](https://github.com/rapidsai/cudf/pull/13105)) [@vyasr](https://github.com/vyasr) +- Explicitly compute null count in concatenate APIs ([#13104](https://github.com/rapidsai/cudf/pull/13104)) [@vyasr](https://github.com/vyasr) +- Replace unnecessary uses of `UNKNOWN_NULL_COUNT` ([#13102](https://github.com/rapidsai/cudf/pull/13102)) [@vyasr](https://github.com/vyasr) +- Performance improvement for cudf::string_view::find functions ([#13100](https://github.com/rapidsai/cudf/pull/13100)) [@davidwendt](https://github.com/davidwendt) +- Use `.element()` instead of `.data()` for window range calculations ([#13095](https://github.com/rapidsai/cudf/pull/13095)) [@mythrocks](https://github.com/mythrocks) +- Cleanup Parquet chunked writer ([#13094](https://github.com/rapidsai/cudf/pull/13094)) [@ttnghia](https://github.com/ttnghia) +- Fix unused variable error/warning in page_data.cu 
([#13093](https://github.com/rapidsai/cudf/pull/13093)) [@davidwendt](https://github.com/davidwendt) +- Cleanup ORC chunked writer ([#13091](https://github.com/rapidsai/cudf/pull/13091)) [@ttnghia](https://github.com/ttnghia) +- Remove using namespace cudf; from libcudf gtests source ([#13089](https://github.com/rapidsai/cudf/pull/13089)) [@davidwendt](https://github.com/davidwendt) +- Change cudf::test::make_null_mask to also return null-count ([#13081](https://github.com/rapidsai/cudf/pull/13081)) [@davidwendt](https://github.com/davidwendt) +- Resolved automerger from `branch-23.04` to `branch-23.06` ([#13080](https://github.com/rapidsai/cudf/pull/13080)) [@galipremsagar](https://github.com/galipremsagar) +- Assert for non-empty nulls ([#13071](https://github.com/rapidsai/cudf/pull/13071)) [@razajafri](https://github.com/razajafri) +- Remove deprecated regex functions from libcudf ([#13067](https://github.com/rapidsai/cudf/pull/13067)) [@davidwendt](https://github.com/davidwendt) +- Refactor `cudf::detail::sorted_order` ([#13062](https://github.com/rapidsai/cudf/pull/13062)) [@ttnghia](https://github.com/ttnghia) +- Improve performance of slice_strings for long strings ([#13057](https://github.com/rapidsai/cudf/pull/13057)) [@davidwendt](https://github.com/davidwendt) +- Reduce shared memory usage in gpuComputePageSizes by 50% ([#13047](https://github.com/rapidsai/cudf/pull/13047)) [@nvdbaranec](https://github.com/nvdbaranec) +- [REVIEW] Add notes to performance comparisons notebook ([#13044](https://github.com/rapidsai/cudf/pull/13044)) [@galipremsagar](https://github.com/galipremsagar) +- Enable binary operations between scalars and columns of differing decimal types ([#13034](https://github.com/rapidsai/cudf/pull/13034)) [@shwina](https://github.com/shwina) +- Remove console output from some libcudf gtests ([#13027](https://github.com/rapidsai/cudf/pull/13027)) [@davidwendt](https://github.com/davidwendt) +- Remove underscore in build string. ([#13025](https://github.com/rapidsai/cudf/pull/13025)) [@bdice](https://github.com/bdice) +- Bump up JNI version 23.06.0-SNAPSHOT ([#13021](https://github.com/rapidsai/cudf/pull/13021)) [@pxLi](https://github.com/pxLi) +- Fix auto merger from `branch-23.04` to `branch-23.06` ([#13009](https://github.com/rapidsai/cudf/pull/13009)) [@galipremsagar](https://github.com/galipremsagar) +- Reduce peak memory use when writing compressed ORC files. 
([#12963](https://github.com/rapidsai/cudf/pull/12963)) [@vuule](https://github.com/vuule) +- Add nvtx annotatations to groupby methods ([#12941](https://github.com/rapidsai/cudf/pull/12941)) [@wence-](https://github.com/wence-) +- Compute column sizes in Parquet preprocess with single kernel ([#12931](https://github.com/rapidsai/cudf/pull/12931)) [@SrikarVanavasam](https://github.com/SrikarVanavasam) +- Add Python bindings for time zone data (TZiF) reader ([#12826](https://github.com/rapidsai/cudf/pull/12826)) [@shwina](https://github.com/shwina) +- Optimize set-like operations ([#12769](https://github.com/rapidsai/cudf/pull/12769)) [@ttnghia](https://github.com/ttnghia) +- [REVIEW] Upgrade to `arrow-11` ([#12757](https://github.com/rapidsai/cudf/pull/12757)) [@galipremsagar](https://github.com/galipremsagar) +- Add empty test files for test reorganization ([#12288](https://github.com/rapidsai/cudf/pull/12288)) [@shwina](https://github.com/shwina) + # cuDF 23.04.00 (6 Apr 2023) ## 🚨 Breaking Changes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f5959de10ab..07537e75018 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -73,7 +73,7 @@ Compilers: * `gcc` version 9.3+ * `nvcc` version 11.5+ -* `cmake` version 3.23.1+ +* `cmake` version 3.26.4+ CUDA/GPU: diff --git a/README.md b/README.md index e62d6772755..64c980d0cb3 100644 --- a/README.md +++ b/README.md @@ -61,11 +61,11 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### Conda -cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: +cuDF can be installed with conda (via [miniconda](https://conda.io/miniconda.html) or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=23.06 python=3.10 cudatoolkit=11.8 + cudf=23.10 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/build.sh b/build.sh index 3d004f4fd4d..2ad69712e5d 100755 --- a/build.sh +++ b/build.sh @@ -32,7 +32,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li custreamz - build the custreamz Python package -v - verbose build mode -g - build for debug - -n - no install step + -n - no install step (does not affect Python) --allgpuarch - build for all supported GPU architectures --disable_nvtx - disable inserting NVTX profiling ranges --opensource_nvcomp - disable use of proprietary nvcomp extensions @@ -332,10 +332,9 @@ fi if buildAll || hasArg cudf; then cd ${REPODIR}/python/cudf - python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - fi + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} ${EXTRA_CMAKE_ARGS}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ + python -m pip install --no-build-isolation --no-deps . 
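+    # For example, `./build.sh cudf` now amounts to the following standalone
+    # invocation from python/cudf (values shown here are illustrative, not
+    # taken from this script):
+    #   SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=$HOME/local" \
+    #   SKBUILD_BUILD_OPTIONS="-j8" \
+    #   python -m pip install --no-build-isolation --no-deps .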
fi @@ -343,12 +342,7 @@ fi if buildAll || hasArg dask_cudf; then cd ${REPODIR}/python/dask_cudf - if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt - else - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - fi + python -m pip install --no-build-isolation --no-deps . fi if hasArg cudfjar; then @@ -375,21 +369,15 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt - else - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} --library-dir=${LIBCUDF_BUILD_DIR} - fi + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ + python -m pip install --no-build-isolation --no-deps . fi # build custreamz Python package if hasArg custreamz; then cd ${REPODIR}/python/custreamz - if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt - else - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} --library-dir=${LIBCUDF_BUILD_DIR} - fi + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ + python -m pip install --no-build-isolation --no-deps . fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index bc27e7d76b0..8b757fecf5a 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,6 +11,8 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild conda/recipes/libcudf +# With boa installed conda build forward to boa +rapids-conda-retry mambabuild \ + conda/recipes/libcudf rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index bfb782ef467..1ed047a500b 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -19,39 +19,39 @@ rapids-print-env rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) -VERSION_NUMBER="23.06" rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf +export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_DOCS_DIR="$(mktemp -d)" -rapids-logger "Build Doxygen docs" +rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/${VERSION_NUMBER}/html/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/${RAPIDS_VERSION_NUMBER}/html/rmm.tag . 
|| echo "Failed to download rmm Doxygen tag" doxygen Doxyfile +mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" +mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" popd -rapids-logger "Build cuDF Sphinx docs" +rapids-logger "Build Python docs" pushd docs/cudf -sphinx-build -b dirhtml source _html -sphinx-build -b text source _text +make dirhtml +make text +mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt} +mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" +mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" popd - rapids-logger "Build dask-cuDF Sphinx docs" pushd docs/dask_cudf -sphinx-build -b dirhtml source _html -sphinx-build -b text source _text +make dirhtml +make text +mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/"{html,txt} +mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" +mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt" popd - -if [[ ${RAPIDS_BUILD_TYPE} == "branch" ]]; then - rapids-logger "Upload Docs to S3" - aws s3 sync --no-progress --delete cpp/doxygen/html "s3://rapidsai-docs/libcudf/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/cudf/_html "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/cudf/_text "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/txt" - aws s3 sync --no-progress --delete docs/dask_cudf/_html "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/html" - aws s3 sync --no-progress --delete docs/dask_cudf/_text "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/txt" -fi +rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index ec34d63b282..61f160b25f5 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -15,24 +15,25 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly -rapids-mamba-retry mambabuild \ +# With boa installed conda build forwards to the boa builder +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh new file mode 100755 index 00000000000..06d0c3c7a56 --- /dev/null +++ b/ci/build_wheel.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_name=$1 +package_dir=$2 + +source rapids-configure-sccache +source rapids-date-string + +# Use gha-tools rapids-pip-wheel-version to generate wheel version then +# update the necessary files +version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. 
+pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +if [[ ${package_name} == "dask_cudf" ]]; then + sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} +else + sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} + # ptxcompiler and cubinlinker aren't version constrained + sed -r -i "s/ptxcompiler\"/ptxcompiler${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + sed -r -i "s/cubinlinker\"/cubinlinker${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +fi + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} + sed -i "/ptxcompiler/d" ${pyproject_file} + sed -i "/cubinlinker/d" ${pyproject_file} +fi + +cd "${package_dir}" + +python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh new file mode 100755 index 00000000000..7d3919b2d72 --- /dev/null +++ b/ci/build_wheel_cudf.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir="python/cudf" + +export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" + +./ci/build_wheel.sh cudf ${package_dir} + +mkdir -p ${package_dir}/final_dist +python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh new file mode 100755 index 00000000000..47e35c46004 --- /dev/null +++ b/ci/build_wheel_dask_cudf.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
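+# Thin wrapper around the shared ci/build_wheel.sh above; it effectively runs
+#   ./ci/build_wheel.sh dask_cudf python/dask_cudf
+# and then uploads the wheel straight from dist/ (unlike the cudf wheel script,
+# no auditwheel repair pass is applied here).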
+ +set -euo pipefail + +package_dir="python/dask_cudf" + +./ci/build_wheel.sh dask_cudf ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/check_style.sh b/ci/check_style.sh index 36b856ae6f3..e96ad8bf1db 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index e76d9524c76..dd89b092496 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -26,7 +26,6 @@ re.compile(r"CMakeLists[.]txt$"), re.compile(r"CMakeLists_standalone[.]txt$"), re.compile(r"setup[.]cfg$"), - re.compile(r"[.]flake8[.]cython$"), re.compile(r"meta[.]yaml$"), ] ExemptFiles = [ diff --git a/ci/checks/doxygen.sh b/ci/checks/doxygen.sh index f260fbcd1a4..d932fa097e9 100755 --- a/ci/checks/doxygen.sh +++ b/ci/checks/doxygen.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. ############################### # cuDF doxygen warnings check # ############################### @@ -13,11 +13,11 @@ fi # Utility to return version as number for comparison function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } -# doxygen supported version 1.8.20 to 1.9.1 +# doxygen supported version 1.9.1 DOXYGEN_VERSION=`doxygen --version` -if [ $(version "$DOXYGEN_VERSION") -lt $(version "1.8.20") ] || [ $(version $DOXYGEN_VERSION) -gt $(version "1.9.1") ]; then +if [ ! $(version "$DOXYGEN_VERSION") -eq $(version "1.9.1") ] ; then echo -e "warning: Unsupported doxygen version $DOXYGEN_VERSION" - echo -e "Expecting doxygen version from 1.8.20 to 1.9.1" + echo -e "Expecting doxygen version 1.9.1" exit 0 fi diff --git a/ci/docs/build.sh b/ci/docs/build.sh deleted file mode 100755 index f50bb14d648..00000000000 --- a/ci/docs/build.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -################################# -# cuDF Docs build script for CI # -################################# - -if [ -z "$PROJECT_WORKSPACE" ]; then - echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment" - echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally" - exit 1 -fi - -export DOCS_WORKSPACE="$WORKSPACE/docs" -export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export HOME="$WORKSPACE" -export PROJECT_WORKSPACE=/rapids/cudf -export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" -export PROJECTS=(cudf libcudf) - -gpuci_logger "Check environment..." -env - -gpuci_logger "Check GPU usage..." -nvidia-smi - -gpuci_logger "Activate conda env..." -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -gpuci_logger "Check versions..." 
-python --version - -conda info -conda config --show-sources -conda list --show-channel-urls - - -#libcudf Doxygen build -gpuci_logger "Build libcudf docs..." -cd $PROJECT_WORKSPACE/cpp/doxygen -wget "https://raw.githubusercontent.com/rapidsai/docs/gh-pages/api/librmm/${BRANCH_VERSION}/rmm.tag" || echo "Failed to download rmm Doxygen tag" -doxygen Doxyfile - -#cudf Sphinx Build -gpuci_logger "Build cuDF docs..." -cd $PROJECT_WORKSPACE/docs/cudf -make html - -#Commit to Website -cd $DOCS_WORKSPACE - -for PROJECT in ${PROJECTS[@]}; do - if [ ! -d "api/$PROJECT/$BRANCH_VERSION" ]; then - mkdir -p api/$PROJECT/$BRANCH_VERSION - fi - rm -rf $DOCS_WORKSPACE/api/$PROJECT/$BRANCH_VERSION/* -done - - -mv $PROJECT_WORKSPACE/docs/cudf/build/html/* $DOCS_WORKSPACE/api/cudf/$BRANCH_VERSION -mv $PROJECT_WORKSPACE/cpp/doxygen/html/* $DOCS_WORKSPACE/api/libcudf/$BRANCH_VERSION diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index 3de1814dfaf..00000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -# pyproject.toml versions -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf_kafka/pyproject.toml -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/custreamz/pyproject.toml - -# cudf pyproject.toml cuda suffixes -sed -i "s/^name = \"cudf\"/name = \"cudf${CUDA_SUFFIX}\"/g" python/cudf/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/cudf/pyproject.toml -sed -i "s/ptxcompiler/ptxcompiler${CUDA_SUFFIX}/g" python/cudf/pyproject.toml -sed -i "s/cubinlinker/cubinlinker${CUDA_SUFFIX}/g" python/cudf/pyproject.toml - -# dask_cudf pyproject.toml cuda suffixes -sed -i "s/^name = \"dask_cudf\"/name = \"dask_cudf${CUDA_SUFFIX}\"/g" python/dask_cudf/pyproject.toml -# Need to provide the == to avoid modifying the URL -sed -i "s/\"cudf==/\"cudf${CUDA_SUFFIX}==/g" python/dask_cudf/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/cudf/pyproject.toml - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/{cudf,dask_cudf}/pyproject.toml - sed -i "s/numba[<=>\.,0-9]*/numba>=0.57/g" python/{cudf,dask_cudf}/pyproject.toml - sed -i "/ptxcompiler/d" python/cudf/pyproject.toml - sed -i "/cubinlinker/d" python/cudf/pyproject.toml -fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2ee901d178e..5e735a71994 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -21,12 +21,14 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} #Get . 
for next version NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" # Need to distutils-normalize the versions for some use cases CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -60,6 +62,9 @@ sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/p sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/pyproject.toml sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/custreamz/pyproject.toml +# Wheel testing script +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh + # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake @@ -75,14 +80,24 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/ sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/dask_cudf/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/dask_cudf/source/conf.py - -# bump rmm & dask-cuda -for FILE in conda/environments/*.yaml dependencies.yaml; do - sed_runner "s/cudf==${CURRENT_SHORT_TAG_PEP440}/cudf==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/cudf_kafka==${CURRENT_SHORT_TAG_PEP440}/cudf_kafka==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/dask-cuda==${CURRENT_SHORT_TAG_PEP440}/dask-cuda==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/kvikio==${CURRENT_SHORT_TAG_PEP440}/kvikio==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; - sed_runner "s/rmm==${CURRENT_SHORT_TAG_PEP440}/rmm==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; +DEPENDENCIES=( + cudf + cudf_kafka + custreamz + dask-cuda + dask-cudf + kvikio + libkvikio + librmm + rmm +) +for DEP in "${DEPENDENCIES[@]}"; do + for FILE in dependencies.yaml conda/environments/*.yaml; do + sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" ${FILE} + done + for FILE in python/*/pyproject.toml; do + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + done done # Doxyfile update @@ -96,13 +111,15 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt -# Dependency versions in pyproject.toml -sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml -sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/pyproject.toml - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" sed_runner 
"s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done -sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh +sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +# Java files +NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" +sed_runner "s|.*-SNAPSHOT|${NEXT_FULL_JAVA_TAG}|g" java/pom.xml +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" java/ci/README.md +sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 846b90c78e5..30172b76f01 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -7,35 +7,39 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -# Get library for finding incorrect default stream usage. -STREAM_IDENTIFY_LIB_MODE_CUDF="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_cudf.so" -STREAM_IDENTIFY_LIB_MODE_TESTING="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_testing.so" - -echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB_MODE_CUDF}" - # Run libcudf and libcudf_kafka gtests from libcudf-tests package -rapids-logger "Run gtests" - -cd $CONDA_PREFIX/bin/gtests/libcudf/ -export GTEST_CUDF_STREAM_MODE="new_cudf_default" export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ -export LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} - -ctest -E SPAN_TEST -j20 --output-on-failure - -# This one test is specifically designed to test using a thrust device vector, -# so we expect and allow it to include default stream usage. -_allowlist_filter="SpanTest.CanConstructFromDeviceContainers" -GTEST_FILTER="-${_allowlist_filter}" ctest -R SPAN_TEST -VV -LD_PRELOAD= GTEST_CUDF_STREAM_MODE=default GTEST_FILTER="${_allowlist_filter}" ctest -R SPAN_TEST -VV +pushd $CONDA_PREFIX/bin/gtests/libcudf/ +rapids-logger "Run libcudf gtests" +ctest -j20 --output-on-failure SUITEERROR=$? +popd if (( ${SUITEERROR} == 0 )); then - cd $CONDA_PREFIX/bin/gtests/libcudf_kafka/ + pushd $CONDA_PREFIX/bin/gtests/libcudf_kafka/ + rapids-logger "Run libcudf_kafka gtests" ctest -j20 --output-on-failure SUITEERROR=$? + popd +fi + +# Ensure that benchmarks are runnable +pushd $CONDA_PREFIX/bin/benchmarks/libcudf/ +rapids-logger "Run tests of libcudf benchmarks" + +if (( ${SUITEERROR} == 0 )); then + # Run a small Google benchmark + ./MERGE_BENCH --benchmark_filter=/2/ + SUITEERROR=$? +fi + +if (( ${SUITEERROR} == 0 )); then + # Run a small nvbench benchmark + ./STRINGS_NVBENCH --run-once --benchmark 0 --devices 0 + SUITEERROR=$? fi +popd rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh new file mode 100755 index 00000000000..83e24ab3ff1 --- /dev/null +++ b/ci/test_wheel_cudf.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +set -eou pipefail + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/cudf*.whl)[test] + +# Run smoke tests for aarch64 pull requests +if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then + python ./ci/wheel_smoke_test_cudf.py +else + python -m pytest -n 8 ./python/cudf/cudf/tests +fi diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh new file mode 100755 index 00000000000..d6e7f4bf65e --- /dev/null +++ b/ci/test_wheel_dask_cudf.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -eou pipefail + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install --no-deps ./local-cudf-dep/cudf*.whl + +# Always install latest dask for testing +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 + +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/dask_cudf*.whl)[test] + +python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4031f1aa1c3..692ba78f317 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -9,24 +9,27 @@ channels: - nvidia dependencies: - aiobotocore>=2.2.0 +- benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 - c-compiler - cachetools -- cmake>=3.23.1,!=3.25.0 +- cmake>=3.26.4 - cubinlinker -- cuda-python>=11.7.1,<12.0 +- cuda-nvtx=11.8 +- cuda-python>=11.7.1,<12.0a0 - cuda-sanitizer-api=11.8.86 -- cudatoolkit=11.8 +- cuda-version=11.8 +- cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=0.29,<0.30 -- dask-core==2023.3.2 -- dask-cuda==23.6.* -- dask==2023.3.2 -- distributed==2023.3.2.1 +- cython>=3.0.0 +- dask-core>=2023.7.1 +- dask-cuda==23.10.* +- dask>=2023.7.1 +- distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 -- doxygen=1.8.20 +- doxygen=1.9.1 - fastavro>=0.22.9 - fmt>=9.1.0,<10 - fsspec>=0.6.0 @@ -34,13 +37,17 @@ dependencies: - gmock>=1.13.0 - gtest>=1.13.0 - hypothesis +- identify>=2.5.20 - ipython -- libarrow==11.0.0.* +- libarrow==12.0.1.* +- libcufile-dev=1.4.0.31 +- libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==23.6.* +- libkvikio==23.10.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.6.* +- librmm==23.10.* +- make - mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python @@ -48,19 +55,20 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.56.4,<0.57 -- numpy>=1.21,<1.24 +- numba>=0.57 +- numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 +- nvcomp==2.6.1 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 - pandoc - pip - pre-commit -- protobuf>=4.21.6,<4.22 +- protobuf>=4.21,<5 - ptxcompiler -- pyarrow==11.0.0.* +- pyarrow==12.0.1.* - pydata-sphinx-theme - pyorc - pytest @@ -72,9 +80,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rmm==23.6.* +- rmm==23.10.* - s3fs>=2022.3.0 
-- scikit-build>=0.13.1,<0.17.2 +- scikit-build>=0.13.1 - scipy - spdlog>=1.11.0,<1.12 - sphinx @@ -86,7 +94,8 @@ dependencies: - sysroot_linux-64==2.17 - tokenizers==0.13.1 - transformers==4.24.0 -- typing_extensions +- typing_extensions>=4.0.0 +- zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml new file mode 100644 index 00000000000..cf1bf4b8733 --- /dev/null +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -0,0 +1,98 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- pytorch +- conda-forge +- nvidia +dependencies: +- aiobotocore>=2.2.0 +- benchmark==1.8.0 +- boto3>=1.21.21 +- botocore>=1.24.21 +- c-compiler +- cachetools +- cmake>=3.26.4 +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvrtc-dev +- cuda-nvtx-dev +- cuda-python>=12.0,<13.0a0 +- cuda-sanitizer-api +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=3.0.0 +- dask-core>=2023.7.1 +- dask-cuda==23.10.* +- dask>=2023.7.1 +- distributed>=2023.7.1 +- dlpack>=0.5,<0.6.0a0 +- doxygen=1.9.1 +- fastavro>=0.22.9 +- fmt>=9.1.0,<10 +- fsspec>=0.6.0 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- gtest>=1.13.0 +- hypothesis +- identify>=2.5.20 +- ipython +- libarrow==12.0.1.* +- libcufile-dev +- libcurand-dev +- libkvikio==23.10.* +- librdkafka>=1.9.0,<1.10.0a0 +- librmm==23.10.* +- make +- mimesis>=4.1.0 +- moto>=4.0.8 +- msgpack-python +- myst-nb +- nbsphinx +- ninja +- notebook +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- nvcomp==2.6.1 +- nvtx>=0.2.1 +- packaging +- pandas>=1.3,<1.6.0dev0 +- pandoc +- pip +- pre-commit +- protobuf>=4.21,<5 +- pyarrow==12.0.1.* +- pydata-sphinx-theme +- pyorc +- pytest +- pytest-benchmark +- pytest-cases +- pytest-cov +- pytest-xdist +- python-confluent-kafka>=1.9.0,<1.10.0a0 +- python-snappy>=0.6.0 +- python>=3.9,<3.11 +- pytorch<1.12.0 +- rmm==23.10.* +- s3fs>=2022.3.0 +- scikit-build>=0.13.1 +- scipy +- spdlog>=1.11.0,<1.12 +- sphinx +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.17 +- tokenizers==0.13.1 +- transformers==4.24.0 +- typing_extensions>=4.0.0 +- zlib>=1.2.13 +- pip: + - git+https://github.com/python-streamz/streamz.git@master +name: all_cuda-120_arch-x86_64 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index 7494fec79a0..c98c2701653 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -8,7 +8,10 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1,!=3.25.0" + - ">=3.26.4" cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f8074711b88..a909b72c878 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -37,52 +37,74 @@ build: # libcudf's run_exports pinning is looser than we would like - libcudf ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} - - ninja + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ 
cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - protobuf >=4.21.6,<4.22 + - protobuf ==4.21.* - python - - cython >=0.29,<0.30 + - cython >=3.0.0 - scikit-build >=0.13.1 - setuptools - - numba >=0.56.4,<0.57 - dlpack >=0.5,<0.6.0a0 - - pyarrow =11 + - pyarrow =12 - libcudf ={{ version }} - rmm ={{ minor_version }} - - cudatoolkit ={{ cuda_version }} + {% if cuda_major == "11" %} + - cudatoolkit + {% else %} + - cuda-cudart-dev + - cuda-nvrtc + - libcufile-dev # [linux64] + {% endif %} + - cuda-version ={{ cuda_version }} run: - - protobuf >=4.21.6,<4.22 + - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }} - python - - typing_extensions + - typing_extensions >=4.0.0 - pandas >=1.3,<1.6.0dev0 - cupy >=12.0.0 - - numba >=0.56.4,<0.57 - - numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations + - numba >=0.57 + - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - - libcudf {{ version }} - - fastavro >=0.22.0 + - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + - ptxcompiler >=0.7.0 + - cubinlinker # CUDA enhanced compatibility. + - cuda-python >=11.7.1,<12.0a0 + {% else %} + # Needed by Numba for CUDA support + - cuda-nvcc-impl + # TODO: Add nvjitlink here + # xref: https://github.com/rapidsai/cudf/issues/12822 + - cuda-nvrtc + - cuda-python >=12.0,<13.0a0 + {% endif %} + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - nvtx >=0.2.1 - packaging - - ptxcompiler >=0.7.0 - cachetools - - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0 test: requires: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - cudf diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index ccc49851a8e..b63a136ad2d 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -6,3 +6,6 @@ cxx_compiler_version: sysroot_version: - "2.17" + +cmake_version: + - ">=3.26.4" diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 9a0d0f0d48e..ec0cc402511 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -4,6 +4,7 @@ {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY @@ -35,25 +36,27 @@ build: requirements: build: - - cmake >=3.23.1,!=3.25.0 + - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - - cython >=0.29,<0.30 + - cython >=3.0.0 + - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} - setuptools run: - python + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - libcudf_kafka ={{ version }} - cudf ={{ version }} test: requires: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index b5aff0090dd..7aaa40bffd0 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -4,6 +4,7 @@ {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: @@ -15,7 +16,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY @@ -38,19 +39,21 @@ requirements: - python - python-confluent-kafka >=1.9.0,<1.10.0a0 - cudf_kafka ={{ version }} + - cuda-version ={{ cuda_version }} run: - python - streamz - cudf ={{ version }} - - dask ==2023.3.2 - - dask-core ==2023.3.2 - - distributed ==2023.3.2.1 - - python-confluent-kafka >=1.9.0,<1.10.0a0 - cudf_kafka ={{ version }} + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 + - python-confluent-kafka >=1.9.0,<1.10.0a0 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: requires: - - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - custreamz diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index d060723859d..12809ba648f 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -38,21 +38,21 @@ requirements: host: - python - cudf ={{ version }} - - dask ==2023.3.2 - - dask-core ==2023.3.2 - - distributed ==2023.3.2.1 - - cudatoolkit ={{ cuda_version }} + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 + - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask ==2023.3.2 - - dask-core ==2023.3.2 - - distributed ==2023.3.2.1 - - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: requires: 
- - cudatoolkit ={{ cuda_version }} + - cuda-version ={{ cuda_version }} imports: - dask_cudf diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index d315e1d8a6d..7dc54747a0c 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -18,18 +18,18 @@ if [ "${ARCH}" = "aarch64" ]; then fi # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.3.2" +export DASK_STABLE_VERSION="2023.7.1" # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed==2023.3.2.1 conda-forge::dask-core==2023.3.2 --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=="2023.3.2.1" conda-forge::dask-core=="2023.3.2" --force-reinstall + rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" + rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall fi logger "python -c 'import dask_cudf'" diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh index 7ac9e83f31c..47047f41b25 100644 --- a/conda/recipes/libcudf/build.sh +++ b/conda/recipes/libcudf/build.sh @@ -1,5 +1,9 @@ #!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. export cudf_ROOT="$(realpath ./cpp/build)" -./build.sh -n -v libcudf libcudf_kafka benchmarks tests --build_metrics --incl_cache_stats --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\" + +./build.sh -n -v \ + libcudf libcudf_kafka benchmarks tests \ + --build_metrics --incl_cache_stats \ + --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON -DNVBench_ENABLE_CUPTI=OFF\" diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index d89cbee67d0..25b3f19de77 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -5,19 +5,25 @@ cxx_compiler_version: - 11 cuda_compiler: + - cuda-nvcc + +cuda11_compiler: - nvcc sysroot_version: - "2.17" cmake_version: - - ">=3.23.1,!=3.25.0" + - ">=3.26.4" + +gbench_version: + - "==1.8.0" gtest_version: - ">=1.13.0" libarrow_version: - - "=11" + - "=12" dlpack_version: - ">=0.5,<0.6.0a0" @@ -25,24 +31,29 @@ dlpack_version: librdkafka_version: - ">=1.9.0,<1.10.0a0" -# The CTK libraries below are missing from the conda-forge::cudatoolkit -# package. The "*_host_*" version specifiers correspond to `11.8` packages and the -# "*_run_*" version specifiers correspond to `11.x` packages. 
+fmt_version: + - ">=9.1.0,<10" + +spdlog_version: + - ">=1.11.0,<1.12" + +nvcomp_version: + - "=2.6.1" -libcufile_host_version: +zlib_version: + - ">=1.2.13" +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. + +cuda11_libcufile_host_version: - "1.4.0.31" -libcufile_run_version: +cuda11_libcufile_run_version: - ">=1.0.0.82,<=1.4.0.31" -libcurand_host_version: +cuda11_libcurand_host_version: - "=10.3.0.86" -libcurand_run_version: +cuda11_libcurand_run_version: - ">=10.2.5.43,<10.3.1" - -fmt_version: - - ">=9.1.0,<10" - -spdlog_version: - - ">=1.11.0,<1.12" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 275b8f9332f..c844131ad31 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,27 +37,43 @@ requirements: - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - cudatoolkit ={{ cuda_version }} + {% if cuda_major == "11" %} + - cudatoolkit + - libcufile {{ cuda11_libcufile_host_version }} # [linux64] + - libcufile-dev {{ cuda11_libcufile_host_version }} # [linux64] + - libcurand {{ cuda11_libcurand_host_version }} + - libcurand-dev {{ cuda11_libcurand_host_version }} - cuda-nvrtc ={{ cuda_version }} - cuda-nvrtc-dev ={{ cuda_version }} - cuda-nvtx ={{ cuda_version }} - - libcufile {{ libcufile_host_version }} # [linux64] - - libcufile-dev {{ libcufile_host_version }} # [linux64] - - libcurand {{ libcurand_host_version }} - - libcurand-dev {{ libcurand_host_version }} + {% else %} + - cuda-nvrtc-dev + - cuda-nvtx-dev + - libcufile-dev # [linux64] + - libcurand-dev + {% endif %} + - cuda-version ={{ cuda_version }} + - nvcomp {{ nvcomp_version }} - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - spdlog {{ spdlog_version }} + - benchmark {{ gbench_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} + - zlib {{ zlib_version }} outputs: - name: libcudf @@ -69,17 +85,25 @@ outputs: run_exports: - {{ pin_subpackage("libcudf", max_pin="x.x") }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} run: - - cudatoolkit {{ cuda_spec }} + {% if cuda_major == "11" %} + - cudatoolkit + - libcufile {{ cuda11_libcufile_run_version }} # [linux64] + {% else %} + - cuda-nvrtc + - libcufile # [linux64] + {% endif %} + - cuda-version {{ cuda_spec }} + - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - libarrow {{ libarrow_version }} - - libcufile {{ libcufile_run_version }} # [linux64] - - libcufile-dev {{ libcufile_run_version }} # [linux64] - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} @@ -91,6 +115,7 @@ outputs: - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_testing.so - test -f $PREFIX/include/cudf/aggregation.hpp - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp + - test -f 
$PREFIX/include/cudf/ast/detail/expression_transformer.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - test -f $PREFIX/include/cudf/ast/expressions.hpp - test -f $PREFIX/include/cudf/binaryop.hpp @@ -107,6 +132,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/binaryop.hpp - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp + - test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp - test -f $PREFIX/include/cudf/detail/contiguous_split.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp - test -f $PREFIX/include/cudf/detail/datetime.hpp @@ -115,7 +141,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/groupby.hpp - test -f $PREFIX/include/cudf/detail/groupby/group_replace_nulls.hpp - test -f $PREFIX/include/cudf/detail/groupby/sort_helper.hpp - - test -f $PREFIX/include/cudf/detail/hashing.hpp - test -f $PREFIX/include/cudf/detail/interop.hpp - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp - test -f $PREFIX/include/cudf/detail/join.hpp @@ -149,6 +174,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp - test -f $PREFIX/include/cudf/detail/utilities/logger.hpp - test -f $PREFIX/include/cudf/detail/utilities/pinned_host_vector.hpp + - test -f $PREFIX/include/cudf/detail/utilities/stacktrace.hpp - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp @@ -167,7 +193,9 @@ outputs: - test -f $PREFIX/include/cudf/fixed_point/temporary.hpp - test -f $PREFIX/include/cudf/groupby.hpp - test -f $PREFIX/include/cudf/hashing.hpp + - test -f $PREFIX/include/cudf/hashing/detail/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp + - test -f $PREFIX/include/cudf/io/arrow_io_source.hpp - test -f $PREFIX/include/cudf/io/avro.hpp - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/data_sink.hpp @@ -184,6 +212,7 @@ outputs: - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - test -f $PREFIX/include/cudf/io/orc_types.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp + - test -f $PREFIX/include/cudf/io/parquet_metadata.hpp - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp @@ -226,6 +255,7 @@ outputs: - test -f $PREFIX/include/cudf/partitioning.hpp - test -f $PREFIX/include/cudf/quantiles.hpp - test -f $PREFIX/include/cudf/reduction.hpp + - test -f $PREFIX/include/cudf/reduction/detail/reduction.hpp - test -f $PREFIX/include/cudf/reduction/detail/reduction_functions.hpp - test -f $PREFIX/include/cudf/reduction/detail/segmented_reduction_functions.hpp - test -f $PREFIX/include/cudf/replace.hpp @@ -329,19 +359,22 @@ outputs: license_family: APACHE license_file: LICENSE summary: libcudf library - prelink_message: - - nvcomp.txt - name: libcudf_kafka version: {{ version }} script: install_libcudf_kafka.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + host: + - librdkafka {{ librdkafka_version }} + - {{ pin_subpackage('libcudf', 
exact=True) }} run: - librdkafka {{ librdkafka_version }} - {{ pin_subpackage('libcudf', exact=True) }} @@ -359,20 +392,32 @@ outputs: script: install_libcudf_example.sh build: number: {{ GIT_DESCRIBE_NUMBER }} - string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('cuda') }} {{ cuda_version }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} host: - {{ pin_subpackage('libcudf', exact=True) }} + {% if cuda_major == "11" %} - cuda-nvtx ={{ cuda_version }} + {% else %} + - cuda-nvtx-dev + {% endif %} + - cuda-version ={{ cuda_version }} run: - {{ pin_subpackage('libcudf', exact=True) }} about: @@ -388,17 +433,34 @@ outputs: number: {{ GIT_DESCRIBE_NUMBER }} string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} ignore_run_exports_from: - - {{ compiler('cuda') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + host: + - {{ pin_subpackage('libcudf', exact=True) }} + - {{ pin_subpackage('libcudf_kafka', exact=True) }} + - cuda-version {{ cuda_spec }} + {% if cuda_major == "11" %} + - libcurand {{ cuda11_libcurand_run_version }} + {% else %} + - libcurand-dev + {% endif %} + - benchmark {{ gbench_version }} + - gtest {{ gtest_version }} + - gmock {{ gtest_version }} run: - {{ pin_subpackage('libcudf', exact=True) }} - {{ pin_subpackage('libcudf_kafka', exact=True) }} - - cudatoolkit {{ cuda_spec }} + - cuda-version {{ cuda_spec }} + {% if cuda_major == "11" %} + - libcurand {{ cuda11_libcurand_run_version }} + {% endif %} + - benchmark {{ gbench_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} - - libcurand {{ libcurand_run_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/conda/recipes/libcudf/nvcomp.txt b/conda/recipes/libcudf/nvcomp.txt deleted file mode 100644 index 9a0047e71fa..00000000000 --- a/conda/recipes/libcudf/nvcomp.txt +++ /dev/null @@ -1,3 +0,0 @@ -By downloading and using the libcudf conda package, you accept the terms -and conditions of the NVIDIA NVCOMP Software License Agreement: - https://developer.download.nvidia.com/compute/nvcomp/2.3/LICENSE.txt diff --git a/conda/recipes/libcudf/post-link.sh b/conda/recipes/libcudf/post-link.sh deleted file mode 100644 index 8ae2349f791..00000000000 --- a/conda/recipes/libcudf/post-link.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Only add the license notice to libcudf and not our examples / tests -if [[ "$PKG_NAME" == "libcudf" ]]; then - cat ./nvcomp.txt >> $PREFIX/.messages.txt -fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0c33550c9df..516865e5782 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../fetch_rapids.cmake) include(rapids-cmake) @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 23.06.00 + VERSION 23.10.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) @@ -62,11 +62,18 @@ option( stream to external libraries." OFF ) +# Option to add all symbols to the dynamic symbol table in the library file, allowing to retrieve +# human-readable stacktrace for debugging. +option( + CUDF_BUILD_STACKTRACE_DEBUG + "Replace the current optimization flags by the options '-rdynamic -Og -NDEBUG', useful for debugging with stacktrace retrieval" + OFF +) option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking option(CUDA_ENABLE_LINEINFO - "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF + "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking @@ -94,13 +101,17 @@ message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_C message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}") message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}") +message( + VERBOSE + "CUDF: Replace the current optimization flags by the options '-rdynamic -Og' (useful for debugging with stacktrace retrieval): ${CUDF_BUILD_STACKTRACE_DEBUG}" +) message( VERBOSE "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNINGS}" ) message( VERBOSE - "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler: ${CUDA_ENABLE_LINEINFO}" + "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") @@ -115,6 +126,10 @@ if(BUILD_TESTS AND NOT CUDF_BUILD_TESTUTIL) ) endif() +if(CUDF_BUILD_STACKTRACE_DEBUG AND NOT CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR "CUDF_BUILD_STACKTRACE_DEBUG is only supported with GCC compiler") +endif() + set(CUDF_CXX_FLAGS "") set(CUDF_CUDA_FLAGS "") set(CUDF_CXX_DEFINITIONS "") @@ -178,8 +193,7 @@ include(cmake/thirdparty/get_arrow.cmake) # find dlpack include(cmake/thirdparty/get_dlpack.cmake) # find libcu++ -include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) -rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) +include(cmake/thirdparty/get_libcudacxx.cmake) # find cuCollections Should come after including thrust and libcudacxx include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest @@ -330,8 +344,10 @@ add_library( src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu - src/hash/murmur_hash.cu - src/hash/spark_murmur_hash.cu + src/hash/murmurhash3_x86_32.cu + src/hash/murmurhash3_x64_128.cu + src/hash/spark_murmurhash3_x86_32.cu + src/hash/xxhash_64.cu 
src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu @@ -346,6 +362,7 @@ add_library( src/io/comp/nvcomp_adapter.cpp src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu + src/io/comp/statistics.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu src/io/csv/csv_gpu.cu @@ -353,13 +370,13 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_gpu.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu - src/io/json/reader_impl.cu - src/io/json/experimental/byte_range_info.cu - src/io/json/experimental/read_json.cpp + src/io/json/read_json.cu + src/io/json/legacy/json_gpu.cu + src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu @@ -373,10 +390,14 @@ add_library( src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp + src/io/parquet/decode_preprocess.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu + src/io/parquet/page_delta_decode.cu + src/io/parquet/page_string_decode.cu + src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp src/io/parquet/reader_impl_helpers.cpp @@ -389,6 +410,7 @@ add_library( src/io/text/bgzip_data_chunk_source.cu src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu + src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp @@ -483,6 +505,7 @@ add_library( src/reshape/byte_cast.cu src/reshape/interleave_columns.cu src/reshape/tile.cu + src/rolling/detail/optimized_unbounded_window.cpp src/rolling/detail/rolling_collect_list.cu src/rolling/detail/rolling_fixed_window.cu src/rolling/detail/rolling_variable_window.cu @@ -580,6 +603,7 @@ add_library( src/text/detokenize.cu src/text/edit_distance.cu src/text/generate_ngrams.cu + src/text/jaccard.cu src/text/minhash.cu src/text/ngrams_tokenize.cu src/text/normalize.cu @@ -608,6 +632,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp + src/utilities/stacktrace.cpp src/utilities/traits.cpp src/utilities/type_checks.cpp src/utilities/type_dispatcher.cpp @@ -646,6 +671,31 @@ target_compile_options( "$<$:${CUDF_CUDA_FLAGS}>" ) +if(CUDF_BUILD_STACKTRACE_DEBUG) + # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option + # 'optimize'". 
+ string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}") + string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS_MINSIZEREL + "${CMAKE_CUDA_FLAGS_MINSIZEREL}" + ) + string(REGEX REPLACE "(\-O[0123])" "" CMAKE_CUDA_FLAGS_RELWITHDEBINFO + "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO}" + ) + + add_library(cudf_backtrace INTERFACE) + target_compile_definitions(cudf_backtrace INTERFACE CUDF_BUILD_STACKTRACE_DEBUG) + target_compile_options( + cudf_backtrace INTERFACE "$<$:-Og>" + "$<$:-Xcompiler=-Og>" + ) + target_link_options( + cudf_backtrace INTERFACE "$<$:-rdynamic>" + "$<$:-Xlinker=-rdynamic>" + ) + target_link_libraries(cudf PRIVATE cudf_backtrace) +endif() + # Specify include paths for the current target and dependents target_include_directories( cudf @@ -829,7 +879,9 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) # depending via ctest and whether it has been updated to expose public stream APIs. foreach(_mode cudf testing) set(_tgt "cudf_identify_stream_usage_mode_${_mode}") - add_library(${_tgt} SHARED tests/utilities/identify_stream_usage.cpp) + add_library( + ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp + ) set_target_properties( ${_tgt} @@ -838,7 +890,14 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON ) + target_compile_options( + ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" + ) + target_include_directories(${_tgt} PRIVATE "$") target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + if(CUDF_BUILD_STACKTRACE_DEBUG) + target_link_libraries(${_tgt} PRIVATE cudf_backtrace) + endif() add_library(cudf::${_tgt} ALIAS ${_tgt}) if("${_mode}" STREQUAL "testing") diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dcc70a4b6d9..5e7862f4b3b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -153,8 +153,12 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp - stream_compaction/unique.cpp stream_compaction/unique_count.cpp + STREAM_COMPACTION_NVBENCH + stream_compaction/distinct.cpp + stream_compaction/distinct_count.cpp + stream_compaction/stable_distinct.cpp + stream_compaction/unique.cpp + stream_compaction/unique_count.cpp ) # ################################################################################################## @@ -220,7 +224,8 @@ ConfigureNVBench( # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- -ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp) +ConfigureBench(HASHING_BENCH hashing/partition.cpp) +ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- @@ -268,39 +273,48 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c # 
################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench( - TEXT_BENCH text/ngrams.cpp text/normalize.cpp text/normalize_spaces.cpp text/replace.cpp - text/subword.cpp text/tokenize.cpp -) +ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) -ConfigureNVBench(TEXT_NVBENCH text/minhash.cpp) +ConfigureNVBench( + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp +) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( STRINGS_BENCH string/combine.cpp - string/contains.cpp string/convert_datetime.cpp string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp string/copy.cu - string/extract.cpp string/factory.cu string/filter.cpp string/find.cpp string/repeat_strings.cpp string/replace.cpp - string/replace_re.cpp string/slice.cpp - string/split.cpp string/translate.cpp string/url_decode.cu ) ConfigureNVBench( - STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/case.cpp + STRINGS_NVBENCH + string/case.cpp + string/char_types.cpp + string/contains.cpp + string/count.cpp + string/extract.cpp + string/gather.cpp + string/join_strings.cpp + string/lengths.cpp + string/like.cpp + string/replace_re.cpp + string/reverse.cpp + string/split.cpp + string/split_re.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index fbba38431dd..a1131df4472 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -42,6 +42,10 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) cuda_event_timer timer(state, true); cudf::binary_operation(lhs, rhs, binop, output_dtype); } + + // use number of bytes read and written to global memory + state.SetBytesProcessed(static_cast(state.iterations()) * column_size * + (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); } // TODO tparam boolean for null. 
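The compiled_binaryop.cpp hunk just above adds throughput reporting: once the timed loop finishes, it hands state.SetBytesProcessed() the bytes read from both input columns plus the bytes written to the output, multiplied by the iteration count, so Google Benchmark can print a bytes-per-second figure alongside the raw time. A minimal self-contained sketch of that same reporting pattern follows; it is not part of this patch, assumes only that Google Benchmark is installed, and uses an invented BM_add_throughput case over host vectors rather than anything from libcudf.

#include <benchmark/benchmark.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative only: mirrors the bytes-processed accounting used above, but on
// the CPU so it stays self-contained. Reads two int64_t columns, writes one.
static void BM_add_throughput(benchmark::State& state)
{
  auto const n = static_cast<std::size_t>(state.range(0));
  std::vector<std::int64_t> lhs(n, 1), rhs(n, 2), out(n);
  for (auto _ : state) {
    std::transform(lhs.begin(), lhs.end(), rhs.begin(), out.begin(),
                   [](std::int64_t a, std::int64_t b) { return a + b; });
    benchmark::DoNotOptimize(out.data());
  }
  // One read each of lhs and rhs plus one write of out per iteration.
  state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations()) *
                          static_cast<std::int64_t>(n) * 3 * sizeof(std::int64_t));
}
BENCHMARK(BM_add_throughput)->Arg(1 << 20);
BENCHMARK_MAIN();

The accounting call sits after the `for (auto _ : state)` loop deliberately: state.iterations() is only final once timing has ended, and the bookkeeping itself should not be measured.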
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index fd7b469cffd..aef3d92b4f5 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -118,13 +118,27 @@ size_t non_fixed_width_size(data_profile const& profile) return get_distribution_mean(dist); } +double geometric_sum(size_t n, double p) +{ + if (p == 1) { return n; } + return (1 - std::pow(p, n)) / (1 - p); +} + template <> size_t non_fixed_width_size(data_profile const& profile) { auto const dist_params = profile.get_distribution_params(); auto const single_level_mean = get_distribution_mean(dist_params.length_params); - auto const element_size = avg_element_size(profile, cudf::data_type{dist_params.element_type}); - return element_size * pow(single_level_mean, dist_params.max_depth); + + auto const element_size = avg_element_size(profile, cudf::data_type{dist_params.element_type}); + auto const element_count = std::pow(single_level_mean, dist_params.max_depth); + + // Each nesting level includes offsets, this is the sum of all levels + // Also include an additional offset per level for the size of the last element + auto const total_offset_count = + geometric_sum(dist_params.max_depth, single_level_mean) + dist_params.max_depth; + + return sizeof(cudf::size_type) * total_offset_count + element_size * element_count; } template <> @@ -441,7 +455,8 @@ std::unique_ptr create_random_column(data_profile const& profile, dtype, num_rows, data.release(), - profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); + profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}, + profile.get_null_probability().has_value() ? null_count : 0); } struct valid_or_zero { @@ -721,8 +736,11 @@ std::unique_ptr create_random_column(data_profile thrust::device_pointer_cast(offsets.end())[-1] = current_child_column->size(); // Always include all elements - auto offsets_column = std::make_unique( - cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); + auto offsets_column = std::make_unique(cudf::data_type{cudf::type_id::INT32}, + num_rows + 1, + offsets.release(), + rmm::device_buffer{}, + 0); auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(), valids.end(), @@ -781,6 +799,25 @@ std::vector cycle_dtypes(std::vector const& dtype_ return out_dtypes; } +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. 
+ */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num) +{ + std::vector out_dtypes; + out_dtypes.reserve(num_cols); + for (cudf::size_type col = 0; col < first_num; ++col) + out_dtypes.push_back(dtype_ids.first); + for (cudf::size_type col = first_num; col < num_cols; ++col) + out_dtypes.push_back(dtype_ids.second); + return out_dtypes; +} + std::unique_ptr create_random_table(std::vector const& dtype_ids, table_size_bytes table_bytes, data_profile const& profile, diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 8a5811218d0..a2efdb819bf 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -666,6 +666,21 @@ std::unique_ptr create_sequence_table( */ std::vector cycle_dtypes(std::vector const& dtype_ids, cudf::size_type num_cols); + +/** + * @brief Repeat the given two data types with a given ratio of a:b. + * + * The first dtype will have 'first_num' columns and the second will have 'num_cols - first_num' + * columns. + * + * @param dtype_ids Pair of requested column types + * @param num_cols Total number of columns in the output vector + * @param first_num Total number of columns of type `dtype_ids.first` + * @return A vector of type_ids + */ +std::vector mix_dtypes(std::pair const& dtype_ids, + cudf::size_type num_cols, + int first_num); /** * @brief Create a random null mask object * diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index aff90039cb9..910fc689c0b 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -25,12 +25,30 @@ #include -template +void contiguous_split(cudf::table_view const& src_table, std::vector const& splits) +{ + auto result = cudf::contiguous_split(src_table, splits); +} + +void chunked_pack(cudf::table_view const& src_table, std::vector const&) +{ + auto const mr = rmm::mr::get_current_device_resource(); + auto const stream = cudf::get_default_stream(); + auto user_buffer = rmm::device_uvector(100L * 1024 * 1024, stream, mr); + auto chunked_pack = cudf::chunked_pack::create(src_table, user_buffer.size(), mr); + while (chunked_pack->has_next()) { + auto iter_size = chunked_pack->next(user_buffer); + } + stream.synchronize(); +} + +template void BM_contiguous_split_common(benchmark::State& state, std::vector& src_cols, int64_t num_rows, int64_t num_splits, - int64_t bytes_total) + int64_t bytes_total, + ContigSplitImpl& impl) { // generate splits std::vector splits; @@ -57,7 +75,7 @@ void BM_contiguous_split_common(benchmark::State& state, for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - auto result = cudf::contiguous_split(src_table, splits); + impl(src_table, splits); } // it's 2x bytes_total because we're both reading and writing. @@ -65,8 +83,10 @@ void BM_contiguous_split_common(benchmark::State& state, } class ContiguousSplit : public cudf::benchmark {}; +class ChunkedPack : public cudf::benchmark {}; -void BM_contiguous_split(benchmark::State& state) +template +void BM_contiguous_split(benchmark::State& state, ContiguousSplitImpl& impl) { int64_t const total_desired_bytes = state.range(0); cudf::size_type const num_cols = state.range(1); @@ -91,12 +111,14 @@ void BM_contiguous_split(benchmark::State& state) (include_validity ? 
(max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) : 0); - BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); + BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl); } class ContiguousSplitStrings : public cudf::benchmark {}; +class ChunkedPackStrings : public cudf::benchmark {}; -void BM_contiguous_split_strings(benchmark::State& state) +template +void BM_contiguous_split_strings(benchmark::State& state, ContiguousSplitImpl& impl) { int64_t const total_desired_bytes = state.range(0); cudf::size_type const num_cols = state.range(1); @@ -104,7 +126,7 @@ void BM_contiguous_split_strings(benchmark::State& state) bool const include_validity = state.range(3) != 0; constexpr int64_t string_len = 8; - std::vector h_strings{ + std::vector h_strings{ "aaaaaaaa", "bbbbbbbb", "cccccccc", "dddddddd", "eeeeeeee", "ffffffff", "gggggggg", "hhhhhhhh"}; int64_t const col_len_bytes = total_desired_bytes / num_cols; @@ -129,17 +151,17 @@ void BM_contiguous_split_strings(benchmark::State& state) } int64_t const total_bytes = - total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) + + total_desired_bytes + ((num_rows + 1) * sizeof(cudf::size_type)) + (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) : 0); - BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); + BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl); } #define CSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ BENCHMARK_DEFINE_F(ContiguousSplit, name)(::benchmark::State & state) \ { \ - BM_contiguous_split(state); \ + BM_contiguous_split(state, contiguous_split); \ } \ BENCHMARK_REGISTER_F(ContiguousSplit, name) \ ->Args({size, num_columns, num_splits, validity}) \ @@ -168,7 +190,7 @@ CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, #define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state) \ { \ - BM_contiguous_split_strings(state); \ + BM_contiguous_split_strings(state, contiguous_split); \ } \ BENCHMARK_REGISTER_F(ContiguousSplitStrings, name) \ ->Args({size, num_columns, num_splits, validity}) \ @@ -189,3 +211,53 @@ CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 10 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); + +#define CCSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ + BENCHMARK_DEFINE_F(ChunkedPack, name)(::benchmark::State & state) \ + { \ + BM_contiguous_split(state, chunked_pack); \ + } \ + BENCHMARK_REGISTER_F(ChunkedPack, name) \ + ->Args({size, num_columns, num_splits, validity}) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime() \ + ->Iterations(8) +CCSBM_BENCHMARK_DEFINE(6Gb512ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_BENCHMARK_DEFINE(6Gb512ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_BENCHMARK_DEFINE(6Gb10ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 1); + +CCSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0); 
+CCSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); + +CCSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); + +#define CCSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ + BENCHMARK_DEFINE_F(ChunkedPackStrings, name)(::benchmark::State & state) \ + { \ + BM_contiguous_split_strings(state, chunked_pack); \ + } \ + BENCHMARK_REGISTER_F(ChunkedPackStrings, name) \ + ->Args({size, num_columns, num_splits, validity}) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime() \ + ->Iterations(8) + +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); + +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1); +CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index a10f54b3d6f..50ddfb82feb 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -47,6 +47,14 @@ static void BM_copy_if_else(benchmark::State& state, bool nulls) cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::copy_if_else(lhs, rhs, decision); } + + auto const bytes_read = n_rows * (sizeof(TypeParam) + sizeof(bool)); + auto const bytes_written = n_rows * sizeof(TypeParam); + auto const null_bytes = nulls ? 2 * cudf::bitmask_allocation_size_bytes(n_rows) : 0; + + // Use number of bytes read and written. 
+ state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + null_bytes)); } #define COPY_BENCHMARK_DEFINE(name, type, b) \ diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index 149fd611656..eeb0149fb3a 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -31,8 +31,8 @@ class Gather : public cudf::benchmark {}; template void BM_gather(benchmark::State& state) { - const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const auto n_cols = (cudf::size_type)state.range(1); + cudf::size_type const source_size{(cudf::size_type)state.range(0)}; + auto const n_cols = (cudf::size_type)state.range(1); // Gather indices auto gather_map_table = diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index e153abee3a3..bc6c2e52da8 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,17 +73,17 @@ class benchmark : public ::benchmark::Fixture { public: benchmark() : ::benchmark::Fixture() { - const char* env_iterations = std::getenv("CUDF_BENCHMARK_ITERATIONS"); + char const* env_iterations = std::getenv("CUDF_BENCHMARK_ITERATIONS"); if (env_iterations != nullptr) { this->Iterations(std::max(0L, atol(env_iterations))); } } - void SetUp(const ::benchmark::State& state) override + void SetUp(::benchmark::State const& state) override { mr = make_pool_instance(); rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool } - void TearDown(const ::benchmark::State& state) override + void TearDown(::benchmark::State const& state) override { // reset default resource to the initial resource rmm::mr::set_current_device_resource(nullptr); @@ -91,10 +91,10 @@ class benchmark : public ::benchmark::Fixture { } // eliminate partial override warnings (see benchmark/benchmark.h) - void SetUp(::benchmark::State& st) override { SetUp(const_cast(st)); } + void SetUp(::benchmark::State& st) override { SetUp(const_cast<::benchmark::State const&>(st)); } void TearDown(::benchmark::State& st) override { - TearDown(const_cast(st)); + TearDown(const_cast<::benchmark::State const&>(st)); } std::shared_ptr mr; diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp new file mode 100644 index 00000000000..e08f9101522 --- /dev/null +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace detail { +static std::string rmm_mode_param{"--rmm_mode"}; ///< RMM mode command-line parameter name +} // namespace detail + +/** + * Base fixture for cudf benchmarks using nvbench. + * + * Initializes the default memory resource to use the RMM pool device resource. + */ +struct nvbench_base_fixture { + inline auto make_cuda() { return std::make_shared(); } + + inline auto make_pool() + { + return rmm::mr::make_owning_wrapper(make_cuda()); + } + + inline auto make_async() { return std::make_shared(); } + + inline auto make_managed() { return std::make_shared(); } + + inline auto make_arena() + { + return rmm::mr::make_owning_wrapper(make_cuda()); + } + + inline auto make_managed_pool() + { + return rmm::mr::make_owning_wrapper(make_managed()); + } + + inline std::shared_ptr create_memory_resource( + std::string const& mode) + { + if (mode == "cuda") return make_cuda(); + if (mode == "pool") return make_pool(); + if (mode == "async") return make_async(); + if (mode == "arena") return make_arena(); + if (mode == "managed") return make_managed(); + if (mode == "managed_pool") return make_managed_pool(); + CUDF_FAIL("Unknown rmm_mode parameter: " + mode + + "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); + } + + nvbench_base_fixture(int argc, char const* const* argv) + { + for (int i = 1; i < argc - 1; ++i) { + std::string arg = argv[i]; + if (arg == detail::rmm_mode_param) { + i++; + rmm_mode = argv[i]; + } + } + + mr = create_memory_resource(rmm_mode); + rmm::mr::set_current_device_resource(mr.get()); + std::cout << "RMM memory resource = " << rmm_mode << "\n"; + } + + std::shared_ptr mr; + std::string rmm_mode{"pool"}; +}; + +} // namespace cudf diff --git a/cpp/benchmarks/fixture/nvbench_main.cpp b/cpp/benchmarks/fixture/nvbench_main.cpp index f58eae62372..64c4d83ac17 100644 --- a/cpp/benchmarks/fixture/nvbench_main.cpp +++ b/cpp/benchmarks/fixture/nvbench_main.cpp @@ -14,9 +14,28 @@ * limitations under the License. */ -#include +#include #define NVBENCH_ENVIRONMENT cudf::nvbench_base_fixture #include +#include + +// strip off the rmm_mode parameter before passing the +// remaining arguments to nvbench::option_parser +#undef NVBENCH_MAIN_PARSE +#define NVBENCH_MAIN_PARSE(argc, argv) \ + nvbench::option_parser parser; \ + std::vector m_args; \ + for (int i = 0; i < argc; ++i) { \ + std::string arg = argv[i]; \ + if (arg == cudf::detail::rmm_mode_param) { \ + i += 2; \ + } else { \ + m_args.push_back(arg); \ + } \ + } \ + parser.parse(m_args) + +// this declares/defines the main() function using the definitions above NVBENCH_MAIN diff --git a/cpp/benchmarks/fixture/rmm_pool_raii.hpp b/cpp/benchmarks/fixture/rmm_pool_raii.hpp deleted file mode 100644 index 23f49735855..00000000000 --- a/cpp/benchmarks/fixture/rmm_pool_raii.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace cudf { - -/** - * @brief An RAII class setting up RMM memory pool for `nvbench` benchmarks - * - * This is a temporary solution before templated fixtures tests are supported - * in `nvbench`. Similarly to `cudf::benchmark`, creating this RAII object in - * each benchmark will ensure that the RAPIDS Memory Manager pool mode is used - * in benchmarks, which eliminates memory allocation / deallocation performance - * overhead from the benchmark. - * - * Example: - * - * void my_benchmark(nvbench::state& state) { - * cudf::rmm_pool_raii pool_raii; - * state.exec([](nvbench::launch& launch) { - * // benchmark stuff - * }); - * } - * - * NVBENCH_BENCH(my_benchmark); - */ -class rmm_pool_raii { - private: - // memory resource factory helpers - inline auto make_cuda() { return std::make_shared(); } - - inline auto make_pool() - { - return rmm::mr::make_owning_wrapper(make_cuda()); - } - - public: - rmm_pool_raii() - { - mr = make_pool(); - rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool - } - - ~rmm_pool_raii() - { - rmm::mr::set_current_device_resource(nullptr); - mr.reset(); - } - - private: - std::shared_ptr mr; -}; - -/** - * Base fixture for cudf benchmarks using nvbench. - * - * Initializes the default memory resource to use the RMM pool device resource. - */ -struct nvbench_base_fixture { - rmm_pool_raii _mr; -}; - -} // namespace cudf diff --git a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp index 7d86ed1b95c..57f52861cb5 100644 --- a/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/templated_benchmark_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,7 +45,7 @@ namespace cudf { template class FunctionTemplateBenchmark : public Fixture { public: - FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func) + FunctionTemplateBenchmark(char const* name, ::benchmark::internal::Function* func) : Fixture(), func_(func) { this->SetName(name); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 077558f8709..e65c37f001d 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -24,7 +23,7 @@ template void bench_groupby_max(nvbench::state& state, nvbench::type_list) { - const auto size = static_cast(state.get_int64("num_rows")); + auto const size = static_cast(state.get_int64("num_rows")); auto const keys = [&] { data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( diff --git a/cpp/benchmarks/groupby/group_no_requests.cpp b/cpp/benchmarks/groupby/group_no_requests.cpp index 7a35873efe9..34618acec75 100644 --- a/cpp/benchmarks/groupby/group_no_requests.cpp +++ b/cpp/benchmarks/groupby/group_no_requests.cpp @@ -28,7 +28,7 @@ class Groupby : public cudf::benchmark {}; void BM_basic_no_requests(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); @@ -59,7 +59,7 @@ BENCHMARK_REGISTER_F(Groupby, BasicNoRequest) void BM_pre_sorted_no_requests(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/groupby/group_nth.cpp b/cpp/benchmarks/groupby/group_nth.cpp index 948414e8417..f2c24433858 100644 --- a/cpp/benchmarks/groupby/group_nth.cpp +++ b/cpp/benchmarks/groupby/group_nth.cpp @@ -29,7 +29,7 @@ class Groupby : public cudf::benchmark {}; void BM_pre_sorted_nth(benchmark::State& state) { // const cudf::size_type num_columns{(cudf::size_type)state.range(0)}; - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index f74ed95200e..63d738b2951 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -40,7 +39,7 @@ auto make_aggregation_request_vector(cudf::column_view const& values, Args&&... 
template void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) { - const auto size = static_cast(state.get_int64("num_rows")); + auto const size = static_cast(state.get_int64("num_rows")); auto const keys = [&] { data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( diff --git a/cpp/benchmarks/groupby/group_rank.cpp b/cpp/benchmarks/groupby/group_rank.cpp index 6aac3826e55..2122720a421 100644 --- a/cpp/benchmarks/groupby/group_rank.cpp +++ b/cpp/benchmarks/groupby/group_rank.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ #include -#include #include #include diff --git a/cpp/benchmarks/groupby/group_scan.cpp b/cpp/benchmarks/groupby/group_scan.cpp index c9ae10c775f..2ae5b6fc2b8 100644 --- a/cpp/benchmarks/groupby/group_scan.cpp +++ b/cpp/benchmarks/groupby/group_scan.cpp @@ -29,7 +29,7 @@ class Groupby : public cudf::benchmark {}; void BM_basic_sum_scan(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); @@ -61,7 +61,7 @@ BENCHMARK_REGISTER_F(Groupby, BasicSumScan) void BM_pre_sorted_sum_scan(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/groupby/group_shift.cpp b/cpp/benchmarks/groupby/group_shift.cpp index 1ad6560b73f..eda2b3dd158 100644 --- a/cpp/benchmarks/groupby/group_shift.cpp +++ b/cpp/benchmarks/groupby/group_shift.cpp @@ -28,8 +28,8 @@ class Groupby : public cudf::benchmark {}; void BM_group_shift(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - const int num_groups = 100; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; + int const num_groups = 100; data_profile const profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 53ef12ffeaa..44a12c1c30e 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -34,10 +33,10 @@ void bench_groupby_struct_keys(nvbench::state& state) std::default_random_engine generator; std::uniform_int_distribution distribution(0, 100); - const cudf::size_type n_rows{static_cast(state.get_int64("NumRows"))}; - const cudf::size_type n_cols{1}; - const cudf::size_type depth{static_cast(state.get_int64("Depth"))}; - const bool nulls{static_cast(state.get_int64("Nulls"))}; + cudf::size_type const n_rows{static_cast(state.get_int64("NumRows"))}; + cudf::size_type const n_cols{1}; + cudf::size_type const depth{static_cast(state.get_int64("Depth"))}; + bool const nulls{static_cast(state.get_int64("Nulls"))}; // Create columns with values in the range [0,100) std::vector columns; diff --git a/cpp/benchmarks/groupby/group_sum.cpp b/cpp/benchmarks/groupby/group_sum.cpp index fbfb8865b81..b3fd881ccbc 100644 --- a/cpp/benchmarks/groupby/group_sum.cpp +++ b/cpp/benchmarks/groupby/group_sum.cpp @@ -28,7 +28,7 @@ class Groupby : public cudf::benchmark 
{}; void BM_basic_sum(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); @@ -61,7 +61,7 @@ BENCHMARK_REGISTER_F(Groupby, Basic) void BM_pre_sorted_sum(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index d71e4742f0a..e679b4b62d2 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -15,47 +15,71 @@ */ #include -#include -#include #include +#include #include #include -class HashBenchmark : public cudf::benchmark {}; +#include -enum contains_nulls { no_nulls, nulls }; +#include -static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls has_nulls) +static void bench_hash(nvbench::state& state) { - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const data = create_random_table({cudf::type_id::INT64}, row_count{n_rows}); - if (has_nulls == contains_nulls::no_nulls) - data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::hash(data->view(), hid); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const nulls = state.get_float64("nulls"); + // disable null bitmask if probability is exactly 0.0 + bool const no_nulls = nulls == 0.0; + auto const hash_name = state.get_string("hash_name"); + + data_profile const profile = + data_profile_builder().null_probability(no_nulls ? 
std::nullopt : std::optional{nulls}); + auto const data = create_random_table( + {cudf::type_id::INT64, cudf::type_id::STRING}, row_count{num_rows}, profile); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + // collect statistics + cudf::strings_column_view input(data->get_column(1).view()); + auto const chars_size = input.chars_size(); + // add memory read from string column + state.add_global_memory_reads(chars_size); + // add memory read from int64_t column + state.add_global_memory_reads(num_rows); + // add memory read from bitmaks + if (!no_nulls) { + state.add_global_memory_reads(2 * + cudf::bitmask_allocation_size_bytes(num_rows)); } -} + // memory written depends on used hash -#define concat(a, b, c) a##b##c + if (hash_name == "murmurhash3_x86_32") { + state.add_global_memory_writes(num_rows); -#define H_BENCHMARK_DEFINE(name, hid, n) \ - BENCHMARK_DEFINE_F(HashBenchmark, name) \ - (::benchmark::State & st) { BM_hash(st, cudf::hash_id::hid, contains_nulls::n); } \ - BENCHMARK_REGISTER_F(HashBenchmark, name) \ - ->RangeMultiplier(4) \ - ->Ranges({{1 << 14, 1 << 24}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::hashing::murmurhash3_x86_32(data->view()); + }); + } else if (hash_name == "md5") { + // md5 creates a 32-byte string + state.add_global_memory_writes(32 * num_rows); -#define HASH_BENCHMARK_DEFINE(hid, n) H_BENCHMARK_DEFINE(concat(hid, _, n), hid, n) + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); + } else if (hash_name == "spark_murmurhash3_x86_32") { + state.add_global_memory_writes(num_rows); -HASH_BENCHMARK_DEFINE(HASH_MURMUR3, nulls) -HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, nulls) -HASH_BENCHMARK_DEFINE(HASH_MD5, nulls) + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::hashing::spark_murmurhash3_x86_32(data->view()); + }); + } else { + state.skip(hash_name + ": unknown hash name"); + } +} -HASH_BENCHMARK_DEFINE(HASH_MURMUR3, no_nulls) -HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, no_nulls) -HASH_BENCHMARK_DEFINE(HASH_MD5, no_nulls) +NVBENCH_BENCH(bench_hash) + .set_name("hashing") + .add_int64_axis("num_rows", {65536, 16777216}) + .add_float64_axis("nulls", {0.0, 0.1}) + .add_string_axis("hash_name", {"murmurhash3_x86_32", "md5", "spark_murmurhash3_x86_32"}); diff --git a/cpp/benchmarks/hashing/partition.cpp b/cpp/benchmarks/hashing/partition.cpp index b688fe2ed7f..0bec4394216 100644 --- a/cpp/benchmarks/hashing/partition.cpp +++ b/cpp/benchmarks/hashing/partition.cpp @@ -43,6 +43,13 @@ void BM_hash_partition(benchmark::State& state) cuda_event_timer timer(state, true); auto output = cudf::hash_partition(input, columns_to_hash, num_partitions); } + + auto const bytes_read = num_rows * num_cols * sizeof(T); + auto const bytes_written = num_rows * num_cols * sizeof(T); + auto const partition_bytes = num_partitions * sizeof(cudf::size_type); + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + partition_bytes)); } BENCHMARK_DEFINE_F(Hashing, hash_partition) diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp index 4ae4e139b59..6216a9ecec2 100644 --- a/cpp/benchmarks/io/csv/csv_reader_input.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp @@ -16,7 +16,6 @@ #include #include -#include #include 
#include diff --git a/cpp/benchmarks/io/csv/csv_reader_options.cpp b/cpp/benchmarks/io/csv/csv_reader_options.cpp index 2d0e0e5754e..93ef5bed774 100644 --- a/cpp/benchmarks/io/csv/csv_reader_options.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_options.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index 1ca6b5b2a9b..8ff07be1531 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -23,7 +23,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index 7fb505f1d34..c0c88517d41 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -15,8 +15,6 @@ */ #include -#include -#include #include #include //TODO find better replacement @@ -35,6 +33,8 @@ #include +#include + #include namespace { @@ -60,17 +60,16 @@ auto make_test_json_data(nvbench::state& state) auto d_input_scalar = cudf::make_string_scalar(input); auto& d_string_scalar = static_cast(*d_input_scalar); - const cudf::size_type repeat_times = string_size / input.size(); + cudf::size_type const repeat_times = string_size / input.size(); return cudf::strings::repeat_string(d_string_scalar, repeat_times); } // Type used to represent the atomic symbol type used within the finite-state machine using SymbolT = char; // Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -// Helper class to set up transition table, symbol group lookup table, and translation table -using DfaFstT = cudf::io::fst::detail::Dfa; -constexpr std::size_t single_item = 1; +using SymbolOffsetT = uint32_t; +constexpr std::size_t single_item = 1; +constexpr auto max_translation_table_size = TT_NUM_STATES * NUM_SYMBOL_GROUPS; } // namespace @@ -89,12 +88,16 @@ void BM_FST_JSON(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu(d_input.size(), stream_view); - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -124,12 +127,16 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu(d_input.size(), stream_view); - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); + 
cudf::detail::hostdevice_vector output_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -159,10 +166,14 @@ void BM_FST_JSON_no_out(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -192,11 +203,15 @@ void BM_FST_JSON_no_str(nvbench::state& state) state.add_element_count(d_input.size()); // Prepare input & output buffers - hostdevice_vector output_gpu_size(single_item, stream_view); - hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); + cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); + cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp index 55614d040d5..31bb5dafa88 100644 --- a/cpp/benchmarks/io/json/json_reader_input.cpp +++ b/cpp/benchmarks/io/json/json_reader_input.cpp @@ -24,17 +24,13 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; -void json_read_common(cudf::io::json_writer_options const& write_opts, - cuio_source_sink_pair& source_sink, - nvbench::state& state) +void json_read_common(cuio_source_sink_pair& source_sink, nvbench::state& state) { - cudf::io::write_json(write_opts); - cudf::io::json_reader_options read_opts = cudf::io::json_reader_options::builder(source_sink.make_source_info()); @@ -69,16 +65,21 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list(data_type::STRUCT)}); auto const source_type = IO; + cuio_source_sink_pair source_sink(source_type); - auto const tbl = create_random_table( - 
cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); - auto const view = tbl->view(); + { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view).na_rep("null"); + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) + .na_rep("null") + .rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + } - json_read_common(write_opts, source_sink, state); + json_read_common(source_sink, state); } template @@ -87,16 +88,19 @@ void BM_json_read_data_type( { auto const d_type = get_type_or_group(static_cast(DataType)); auto const source_type = IO; - - auto const tbl = create_random_table( - cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); - auto const view = tbl->view(); - cuio_source_sink_pair source_sink(source_type); - cudf::io::json_writer_options const write_opts = - cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view).na_rep("null"); - - json_read_common(write_opts, source_sink, state); + { + auto const tbl = create_random_table( + cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, data_profile_builder()); + auto const view = tbl->view(); + + cudf::io::json_writer_options const write_opts = + cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view) + .na_rep("null") + .rows_per_chunk(100'000); + cudf::io::write_json(write_opts); + } + json_read_common(source_sink, state); } using d_type_list = nvbench::enum_type_list -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp index d03f36ca81f..03ccd4e245d 100644 --- a/cpp/benchmarks/io/json/nested_json.cpp +++ b/cpp/benchmarks/io/json/nested_json.cpp @@ -16,9 +16,6 @@ #include #include -#include - -#include #include @@ -28,6 +25,8 @@ #include #include +#include + #include #include @@ -78,7 +77,7 @@ std::string generate_row( int num_columns, int max_depth, int max_list_size, int max_struct_size, size_t max_bytes) { std::string s = "{"; - const std::vector elems{ + std::vector const elems{ R"(1)", R"(-2)", R"(3.4)", R"("5")", R"("abcdefghij")", R"(true)", R"(null)"}; for (int i = 0; i < num_columns; i++) { s += R"("col)" + num_to_string(i) + R"(": )"; @@ -141,7 +140,7 @@ auto make_test_json_data(cudf::size_type string_size, rmm::cuda_stream_view stre {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a": 1, "b": 8.0, "d": { "author": "Jean-Jacques Rousseau"}},)"; - const cudf::size_type repeat_times = string_size / input.size(); + cudf::size_type const repeat_times = string_size / input.size(); auto d_input_scalar = cudf::make_string_scalar(input, stream); auto& d_string_scalar = static_cast(*d_input_scalar); @@ -192,7 +191,7 @@ void BM_NESTED_JSON_DEPTH(nvbench::state& state) auto d_scalar = cudf::string_scalar( generate_json(100'000'000, 10, depth, 10, 10, string_size), true, 
cudf::get_default_stream()); - auto input = cudf::device_span(d_scalar.data(), d_scalar.size()); + auto input = cudf::device_span(d_scalar.data(), d_scalar.size()); state.add_element_count(input.size()); auto const default_options = cudf::io::json_reader_options{}; diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 4705c083c02..b6e15fb3923 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -25,7 +24,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 0361ba7c7a6..647a411c89d 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -26,7 +25,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; // The number of separate read calls to use when reading files in multiple chunks diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 67bf4cb750b..bb373297222 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -38,7 +37,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/orc/orc_writer_chunks.cpp b/cpp/benchmarks/io/orc/orc_writer_chunks.cpp index eda70bc05e6..dff88d7ab6c 100644 --- a/cpp/benchmarks/io/orc/orc_writer_chunks.cpp +++ b/cpp/benchmarks/io/orc/orc_writer_chunks.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -29,7 +28,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 6ad5d024312..80303ea04af 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -25,7 +24,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow 
benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -114,6 +113,38 @@ void BM_parquet_read_io_compression( parquet_read_common(write_opts, source_sink, state); } +template +void BM_parquet_read_io_small_mixed(nvbench::state& state, + nvbench::type_list>) +{ + auto const d_type = + std::pair{cudf::type_id::STRING, cudf::type_id::INT32}; + + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + cudf::size_type const num_strings = state.get_int64("num_string_cols"); + auto const source_type = IOType; + + // want 80 pages total, across 4 columns, so 20 pages per column + cudf::size_type constexpr n_col = 4; + cudf::size_type constexpr page_size_rows = 10'000; + cudf::size_type constexpr num_rows = page_size_rows * (80 / n_col); + + auto const tbl = + create_random_table(mix_dtypes(d_type, n_col, num_strings), + row_count{num_rows}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cuio_source_sink_pair source_sink(source_type); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .max_page_size_rows(10'000) + .compression(cudf::io::compression_type::NONE); + + parquet_read_common(write_opts, source_sink, state); +} + template void BM_parquet_read_chunks( nvbench::state& state, @@ -140,7 +171,6 @@ void BM_parquet_read_chunks( cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -149,8 +179,9 @@ void BM_parquet_read_chunks( try_drop_l3_cache(); timer.start(); + auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts); do { - auto chunk = reader.read_chunk(); + [[maybe_unused]] auto const chunk = reader.read_chunk(); } while (reader.has_next()); timer.stop(); }); @@ -203,3 +234,12 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("byte_limit", {0, 500'000}); + +NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed, + NVBENCH_TYPE_AXES(nvbench::enum_type_list)) + .set_name("parquet_read_io_small_mixed") + .set_type_axes_names({"io"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}) + .add_int64_axis("num_string_cols", {1, 2, 3}); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 5a6e4a8cb72..4105f2182d7 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -25,7 +24,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr std::size_t data_size = 512 << 20; constexpr std::size_t row_group_size = 128 << 20; diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp 
b/cpp/benchmarks/io/parquet/parquet_writer.cpp index d3d22e06086..13b396ea267 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -38,7 +37,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index ed70f53cad8..b85c97f65f7 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -27,7 +26,7 @@ #include -// Size of the data in the the benchmark dataframe; chosen to be low enough to allow benchmarks to +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index a697c98a320..b5d855d8881 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 2ea2ec34ee8..7acf24c30a5 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -131,7 +131,7 @@ class Iterator : public cudf::benchmark {}; template void BM_iterator(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; auto num_gen = thrust::counting_iterator(0); @@ -195,7 +195,7 @@ void pair_iterator_bench_thrust(cudf::column_view& col, template void BM_pair_iterator(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; auto num_gen = thrust::counting_iterator(0); auto null_gen = diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index c606cd8b4c0..84e607a9f28 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,7 +31,7 @@ #include -__global__ static void init_curand(curandState* state, const int nstates) +__global__ static void init_curand(curandState* state, int const nstates) { int ithread = threadIdx.x + blockIdx.x * blockDim.x; @@ -40,10 +40,10 @@ __global__ static void init_curand(curandState* state, const int nstates) template __global__ static void init_build_tbl(key_type* const build_tbl, - const size_type build_tbl_size, - const int multiplicity, + size_type const build_tbl_size, + int const multiplicity, curandState* state, - const int num_states) + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; @@ -52,7 +52,7 @@ __global__ static void init_build_tbl(key_type* const build_tbl, curandState localState = state[start_idx]; for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) { - const double x = curand_uniform_double(&localState); + double const x = curand_uniform_double(&localState); build_tbl[idx] = static_cast(x * (build_tbl_size / multiplicity)); } @@ -62,13 +62,13 @@ __global__ static void init_build_tbl(key_type* const build_tbl, template __global__ void init_probe_tbl(key_type* const probe_tbl, - const size_type probe_tbl_size, - const size_type build_tbl_size, - const key_type rand_max, - const double selectivity, - const int multiplicity, + size_type const probe_tbl_size, + size_type const build_tbl_size, + key_type const rand_max, + double const selectivity, + int const multiplicity, curandState* state, - const int num_states) + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; @@ -123,11 +123,11 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, */ template void generate_input_tables(key_type* const build_tbl, - const size_type build_tbl_size, + size_type const build_tbl_size, key_type* const probe_tbl, - const size_type probe_tbl_size, - const double selectivity, - const int multiplicity) + size_type const probe_tbl_size, + double const selectivity, + int const multiplicity) { // With large values of rand_max the a lot of temporary storage is needed for the lottery. At the // expense of not being that accurate with applying the selectivity an especially more memory @@ -152,7 +152,7 @@ void generate_input_tables(key_type* const build_tbl, int num_sms{-1}; CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - const int num_states = + int const num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; rmm::device_uvector devStates(num_states, cudf::get_default_stream()); diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 1b9e8cb1cfe..1c02a4488ac 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include template diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 44b7bc0af62..7d1b1c74465 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -96,8 +96,8 @@ void BM_join(state_type& state, Join JoinFunc) } }(); - const double selectivity = 0.3; - const int multiplicity = 1; + double const selectivity = 0.3; + int const multiplicity = 1; // Generate build and probe tables auto build_random_null_mask = [](int size) { diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu index 1420625bbcd..67be4640f84 100644 --- a/cpp/benchmarks/join/mixed_join.cu +++ b/cpp/benchmarks/join/mixed_join.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include template diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index 8e2ca8e677a..dbc3234dabf 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -40,9 +40,9 @@ void BM_lists_scatter(::benchmark::State& state) auto stream = cudf::get_default_stream(); auto mr = rmm::mr::get_current_device_resource(); - const cudf::size_type base_size{(cudf::size_type)state.range(0)}; - const cudf::size_type num_elements_per_row{(cudf::size_type)state.range(1)}; - const auto num_rows = (cudf::size_type)ceil(double(base_size) / num_elements_per_row); + cudf::size_type const base_size{(cudf::size_type)state.range(0)}; + cudf::size_type const num_elements_per_row{(cudf::size_type)state.range(1)}; + auto const num_rows = (cudf::size_type)ceil(double(base_size) / num_elements_per_row); auto source_base_col = make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, base_size, @@ -62,26 +62,26 @@ void BM_lists_scatter(::benchmark::State& state) target_base_col->mutable_view().end()); auto source_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto target_offsets = - make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, num_rows + 1, cudf::mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy(stream), - source_offsets->mutable_view().begin(), - source_offsets->mutable_view().end(), + source_offsets->mutable_view().begin(), + source_offsets->mutable_view().end(), 0, num_elements_per_row); thrust::sequence(rmm::exec_policy(stream), - target_offsets->mutable_view().begin(), - target_offsets->mutable_view().end(), + target_offsets->mutable_view().begin(), + target_offsets->mutable_view().end(), 0, num_elements_per_row); @@ -122,7 +122,11 @@ void BM_lists_scatter(::benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - scatter(cudf::table_view{{*source}}, *scatter_map, cudf::table_view{{*target}}, mr); + scatter(cudf::table_view{{*source}}, + *scatter_map, + cudf::table_view{{*target}}, + cudf::get_default_stream(), + mr); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * 2 * diff --git a/cpp/benchmarks/lists/set_operations.cpp b/cpp/benchmarks/lists/set_operations.cpp index 7a001b75376..5b240923358 100644 --- a/cpp/benchmarks/lists/set_operations.cpp +++ b/cpp/benchmarks/lists/set_operations.cpp @@ -15,7 +15,6 @@ */ #include -#include #include diff --git a/cpp/benchmarks/null_mask/set_null_mask.cpp 
b/cpp/benchmarks/null_mask/set_null_mask.cpp index 6d605b06c23..4ac4c9617e2 100644 --- a/cpp/benchmarks/null_mask/set_null_mask.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask.cpp @@ -23,7 +23,7 @@ class SetNullmask : public cudf::benchmark {}; void BM_setnullmask(benchmark::State& state) { - const cudf::size_type size{(cudf::size_type)state.range(0)}; + cudf::size_type const size{(cudf::size_type)state.range(0)}; rmm::device_buffer mask = cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); auto begin = 0, end = size; diff --git a/cpp/benchmarks/quantiles/quantiles.cpp b/cpp/benchmarks/quantiles/quantiles.cpp index 313a1270d91..24f9cc9c68e 100644 --- a/cpp/benchmarks/quantiles/quantiles.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -30,9 +30,9 @@ static void BM_quantiles(benchmark::State& state, bool nulls) { using Type = int; - const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; - const cudf::size_type n_quantiles{(cudf::size_type)state.range(2)}; + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + cudf::size_type const n_cols{(cudf::size_type)state.range(1)}; + cudf::size_type const n_quantiles{(cudf::size_type)state.range(2)}; // Create columns with values in the range [0,100) data_profile profile = data_profile_builder().cardinality(0).distribution( diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index ba723c16c4b..8b1e71c1585 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -30,7 +30,7 @@ template void BM_reduction_anyall(benchmark::State& state, std::unique_ptr const& agg) { - const cudf::size_type column_size{static_cast(state.range(0))}; + cudf::size_type const column_size{static_cast(state.range(0))}; auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution( dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 
0 : 100); diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 97ac5f56b2d..c1c44c919ac 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -29,7 +29,7 @@ template void BM_reduction_dictionary(benchmark::State& state, std::unique_ptr const& agg) { - const cudf::size_type column_size{static_cast(state.range(0))}; + cudf::size_type const column_size{static_cast(state.range(0))}; // int column and encoded dictionary column data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 2cabcdf680c..963c26692e7 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -27,7 +27,7 @@ class Reduction : public cudf::benchmark {}; template void BM_reduction(benchmark::State& state) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); auto const input_column = create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index 41295f787fc..e55f3b9e09f 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 34e4a47c09d..5bd3e2e3bba 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -30,7 +30,7 @@ class Reduction : public cudf::benchmark {}; template void BM_reduction(benchmark::State& state, std::unique_ptr const& agg) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + cudf::size_type const column_size{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index d5b19faf773..ee97b54fbef 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/reduction/segmented_reduce.cpp b/cpp/benchmarks/reduction/segmented_reduce.cpp index 590a014ad76..7accb82734a 100644 --- a/cpp/benchmarks/reduction/segmented_reduce.cpp +++ b/cpp/benchmarks/reduction/segmented_reduce.cpp @@ -15,8 +15,6 @@ */ #include -#include -#include #include #include @@ -28,6 +26,8 @@ #include +#include + #include bool constexpr is_boolean_output_agg(cudf::segmented_reduce_aggregation::Kind kind) diff --git a/cpp/benchmarks/search/contains.cpp b/cpp/benchmarks/search/contains.cpp index 01a0a37b21a..8d3c3f596d5 100644 --- a/cpp/benchmarks/search/contains.cpp +++ b/cpp/benchmarks/search/contains.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/sort/nested_types_common.hpp b/cpp/benchmarks/sort/nested_types_common.hpp index e0626b1b96f..93853ba5768 100644 --- a/cpp/benchmarks/sort/nested_types_common.hpp +++ b/cpp/benchmarks/sort/nested_types_common.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include @@ -33,8 +32,8 @@ inline std::unique_ptr create_lists_data(nvbench::state& state, cudf::size_type 
const min_val = 0, cudf::size_type const max_val = 5) { - const size_t size_bytes(state.get_int64("size_bytes")); - const cudf::size_type depth{static_cast(state.get_int64("depth"))}; + size_t const size_bytes(state.get_int64("size_bytes")); + cudf::size_type const depth{static_cast(state.get_int64("depth"))}; auto const null_frequency{state.get_float64("null_frequency")}; data_profile table_profile; @@ -55,9 +54,9 @@ inline std::unique_ptr create_structs_data(nvbench::state& state, std::default_random_engine generator; std::uniform_int_distribution distribution(0, 100); - const cudf::size_type n_rows{static_cast(state.get_int64("NumRows"))}; - const cudf::size_type depth{static_cast(state.get_int64("Depth"))}; - const bool nulls{static_cast(state.get_int64("Nulls"))}; + cudf::size_type const n_rows{static_cast(state.get_int64("NumRows"))}; + cudf::size_type const depth{static_cast(state.get_int64("Depth"))}; + bool const nulls{static_cast(state.get_int64("Nulls"))}; // Create columns with values in the range [0,100) std::vector columns; diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index b2178f8f187..6231c7016aa 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -27,7 +27,7 @@ class Rank : public cudf::benchmark {}; static void BM_rank(benchmark::State& state, bool nulls) { using Type = int; - const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; // Create columns with values in the range [0,100) data_profile profile = data_profile_builder().cardinality(0).distribution( diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp index c0227e85191..85427e2128f 100644 --- a/cpp/benchmarks/sort/rank_structs.cpp +++ b/cpp/benchmarks/sort/rank_structs.cpp @@ -26,7 +26,7 @@ void nvbench_rank_structs(nvbench::state& state, nvbench::type_list(state.get_int64("Nulls"))}; + bool const nulls{static_cast(state.get_int64("Nulls"))}; state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { cudf::rank(table->view().column(0), diff --git a/cpp/benchmarks/sort/segmented_sort.cpp b/cpp/benchmarks/sort/segmented_sort.cpp index 22d2b1c4029..2e835259cbc 100644 --- a/cpp/benchmarks/sort/segmented_sort.cpp +++ b/cpp/benchmarks/sort/segmented_sort.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/sort/sort.cpp b/cpp/benchmarks/sort/sort.cpp index cab25f442bb..267a740aee9 100644 --- a/cpp/benchmarks/sort/sort.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -29,8 +29,8 @@ static void BM_sort(benchmark::State& state, bool nulls) { using Type = int; auto const dtype = cudf::type_to_id(); - const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + cudf::size_type const n_cols{(cudf::size_type)state.range(1)}; // Create table with values in the range [0,100) data_profile const profile = data_profile_builder() diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp index 3cab60a29ec..4b04323a99f 100644 --- a/cpp/benchmarks/sort/sort_lists.cpp +++ b/cpp/benchmarks/sort/sort_lists.cpp @@ -84,7 +84,7 @@ void sort_lists_of_structs(nvbench::state& state) void nvbench_sort_lists(nvbench::state& state) { - const auto has_lists_of_structs = state.get_int64("lists_of_structs") > 0; + auto const has_lists_of_structs = state.get_int64("lists_of_structs") > 0; if 
(has_lists_of_structs) { sort_lists_of_structs(state); } else { diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index 216ebc6bfd7..a6feaf04842 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -65,7 +65,7 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) (column_bytes_out + validity_bytes_out) * num_columns; // writing columns state.SetItemsProcessed(state.iterations() * column_size * num_columns); - state.SetBytesProcessed(static_cast(state.iterations()) * bytes_read + bytes_written); + state.SetBytesProcessed(static_cast(state.iterations()) * (bytes_read + bytes_written)); } } // namespace @@ -73,8 +73,8 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) template void BM_apply_boolean_mask(benchmark::State& state, cudf::size_type num_columns) { - const cudf::size_type column_size{static_cast(state.range(0))}; - const cudf::size_type percent_true{static_cast(state.range(1))}; + cudf::size_type const column_size{static_cast(state.range(0))}; + cudf::size_type const percent_true{static_cast(state.range(1))}; data_profile profile = data_profile_builder().cardinality(0).null_probability(0.0).distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 81eafa3044f..c04b6516903 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp new file mode 100644 index 00000000000..bcee3048013 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) +{ + cudf::size_type const num_rows = state.get_int64("NumRows"); + + data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + + auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto input_column = source_column->view(); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(input_table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) + .set_name("stable_distinct") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +template +void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) +{ + auto const size = state.get_int64("ColumnSize"); + auto const dtype = cudf::type_to_id(); + double const null_probability = state.get_float64("null_probability"); + + auto builder = data_profile_builder().null_probability(null_probability); + if (dtype == cudf::type_id::LIST) { + builder.distribution(dtype, distribution_id::UNIFORM, 0, 4) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4) + .list_depth(1); + } else { + // We're comparing stable_distinct() on a non-nested column to that on a list column with the + // same number of stable_distinct rows. The max list size is 4 and the number of distinct values + // in the list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + + // 5^4 = 781. We want this column to also have 781 distinct values. 
+ builder.distribution(dtype, distribution_id::UNIFORM, 0, 781); + } + + auto const table = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("stable_distinct_list") + .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.0, 0.1}) + .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/benchmarks/stream_compaction/unique.cpp b/cpp/benchmarks/stream_compaction/unique.cpp index dafb9d506c7..854bc17e9c1 100644 --- a/cpp/benchmarks/stream_compaction/unique.cpp +++ b/cpp/benchmarks/stream_compaction/unique.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/stream_compaction/unique_count.cpp b/cpp/benchmarks/stream_compaction/unique_count.cpp index f8319e0385c..e003c476685 100644 --- a/cpp/benchmarks/stream_compaction/unique_count.cpp +++ b/cpp/benchmarks/stream_compaction/unique_count.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 0cdd5fbac32..385bb7630f8 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -26,7 +25,7 @@ void bench_case(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("width")); + auto const max_width = static_cast(state.get_int64("row_width")); auto const encoding = state.get_string("encoding"); if (static_cast(n_rows) * static_cast(max_width) >= @@ -72,7 +71,7 @@ void bench_case(nvbench::state& state) } NVBENCH_BENCH(bench_case) - .set_name("strings_case") - .add_int64_axis("width", {32, 64, 128, 256, 512, 1024, 2048}) + .set_name("case") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp new file mode 100644 index 00000000000..8e9e595fcef --- /dev/null +++ b/cpp/benchmarks/string/char_types.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_char_types(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const api_type = state.get_string("api"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto input_types = cudf::strings::string_character_types::SPACE; + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + if (api_type == "all") { + state.add_global_memory_writes(num_rows); // output is a bool8 per row + } else { + state.add_global_memory_writes(chars_size); + } + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (api_type == "all") { + auto result = cudf::strings::all_characters_of_type(input, input_types); + } else { + auto result = cudf::strings::filter_characters_of_type(input, input_types); + } + }); +} + +NVBENCH_BENCH(bench_char_types) + .set_name("char_types") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 714d50ffce3..af45d5d8fee 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -16,35 +16,46 @@ #include #include -#include #include #include +#include #include -#include #include #include #include -class StringContains : public cudf::benchmark {}; +#include -std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t hit_rate) +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate) { // build input table using the following data - auto data = cudf::test::strings_column_wrapper({ - "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }); - auto data_view = cudf::column_view(data); + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = 
raw_data->view(); // compute number of rows in n_rows that should match auto matches = static_cast(n_rows * hit_rate) / 100; @@ -68,51 +79,39 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t return std::move(table->release().front()); } -enum contains_type { contains, count, findall }; - // longer pattern lengths demand more working memory per string std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"}; -static void BM_contains(benchmark::State& state, contains_type ct) +static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const pattern_index = static_cast(state.range(1)); - auto const hit_rate = static_cast(state.range(2)); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const pattern_index = static_cast(state.get_int64("pattern")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } - auto col = build_input_column(n_rows, hit_rate); + auto col = build_input_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; auto program = cudf::strings::regex_program::create(pattern); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (ct) { - case contains_type::contains: // contains_re and matches_re use the same main logic - cudf::strings::contains_re(input, *program); - break; - case contains_type::count: // counts occurrences of matches - cudf::strings::count_re(input, *program); - break; - case contains_type::findall: // returns occurrences of all matches - cudf::strings::findall(input, *program); - break; - } - } + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(input.size()); - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains_re(input, *program); }); } -#define STRINGS_BENCHMARK_DEFINE(name, b) \ - BENCHMARK_DEFINE_F(StringContains, name) \ - (::benchmark::State & st) { BM_contains(st, contains_type::b); } \ - BENCHMARK_REGISTER_F(StringContains, name) \ - ->ArgsProduct({{4096, 32768, 262144, 2097152, 16777216}, /* row count */ \ - {0, 1}, /* patterns index */ \ - {1, 5, 10, 25, 70, 100}}) /* hit rate */ \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(contains_re, contains) -STRINGS_BENCHMARK_DEFINE(count_re, count) -STRINGS_BENCHMARK_DEFINE(findall_re, findall) +NVBENCH_BENCH(bench_contains) + .set_name("contains") + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("hit_rate", {50, 100}) // percentage + .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index 0db38bb5add..f12d292c2e7 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -31,7 +31,7 @@ class DurationsToString : public cudf::benchmark {}; template void BM_convert_from_durations(benchmark::State& state) { - const cudf::size_type source_size = 
state.range(0); + cudf::size_type const source_size = state.range(0); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator( @@ -51,7 +51,7 @@ class StringToDurations : public cudf::benchmark {}; template void BM_convert_to_durations(benchmark::State& state) { - const cudf::size_type source_size = state.range(0); + cudf::size_type const source_size = state.range(0); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator( diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 69fc65333b8..0cc98ee146c 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -38,14 +38,14 @@ class StringsToFixedPoint : public cudf::benchmark {}; template void convert_to_fixed_point(benchmark::State& state) { - const auto rows = static_cast(state.range(0)); - const auto strings_col = get_strings_column(rows); - const auto strings_view = cudf::strings_column_view(strings_col->view()); - const auto dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; + auto const rows = static_cast(state.range(0)); + auto const strings_col = get_strings_column(rows); + auto const strings_view = cudf::strings_column_view(strings_col->view()); + auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; for (auto _ : state) { cuda_event_timer raii(state, true); - volatile auto results = cudf::strings::to_fixed_point(strings_view, dtype); + auto volatile results = cudf::strings::to_fixed_point(strings_view, dtype); } // bytes_processed = bytes_input + bytes_output @@ -58,10 +58,10 @@ class StringsFromFixedPoint : public cudf::benchmark {}; template void convert_from_fixed_point(benchmark::State& state) { - const auto rows = static_cast(state.range(0)); - const auto strings_col = get_strings_column(rows); - const auto dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; - const auto fp_col = + auto const rows = static_cast(state.range(0)); + auto const strings_col = get_strings_column(rows); + auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; + auto const fp_col = cudf::strings::to_fixed_point(cudf::strings_column_view(strings_col->view()), dtype); std::unique_ptr results = nullptr; diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp new file mode 100644 index 00000000000..08406462632 --- /dev/null +++ b/cpp/benchmarks/string/count.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_count(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + std::string pattern = "\\d+"; + + auto prog = cudf::strings::regex_program::create(pattern); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(input.size()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::strings::count_re(input, *prog); }); +} + +NVBENCH_BENCH(bench_count) + .set_name("count") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 021062ee479..9e67c5a5b52 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -14,34 +14,37 @@ * limitations under the License. */ -#include "string_bench_args.hpp" - #include -#include -#include #include #include #include #include +#include -#include +#include -class StringExtract : public cudf::benchmark {}; +#include -static void BM_extract(benchmark::State& state, int groups) +static void bench_extract(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const n_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; std::uniform_int_distribution words_dist(0, 999); - std::vector samples(100); // 100 unique rows of data to reuse std::generate(samples.begin(), samples.end(), [&]() { std::string row; // build a row of random tokens - while (static_cast(row.size()) < n_length) { + while (static_cast(row.size()) < row_width) { row += std::to_string(words_dist(generator)) + " "; } return row; @@ -55,41 +58,27 @@ static void BM_extract(benchmark::State& state, int groups) cudf::test::strings_column_wrapper samples_column(samples.begin(), samples.end()); data_profile const profile = data_profile_builder().no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0ul, samples.size() - 1); - auto map = create_random_column(cudf::type_to_id(), row_count{n_rows}, profile); + auto map = + create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); auto input = cudf::gather( cudf::table_view{{samples_column}}, map->view(), cudf::out_of_bounds_policy::DONT_CHECK); 
cudf::strings_column_view strings_view(input->get_column(0).view()); auto prog = cudf::strings::regex_program::create(pattern); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto results = cudf::strings::extract(strings_view, *prog); - } - - state.SetBytesProcessed(state.iterations() * strings_view.chars_size()); -} + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = strings_view.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_multiplier = 8; - int const min_row_length = 1 << 5; - int const max_row_length = 1 << 13; - int const length_multiplier = 4; - generate_string_bench_args( - b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::extract(strings_view, *prog); + }); } -#define STRINGS_BENCHMARK_DEFINE(name, instructions) \ - BENCHMARK_DEFINE_F(StringExtract, name) \ - (::benchmark::State & st) { BM_extract(st, instructions); } \ - BENCHMARK_REGISTER_F(StringExtract, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(one, 1) -STRINGS_BENCHMARK_DEFINE(two, 2) -STRINGS_BENCHMARK_DEFINE(four, 4) -STRINGS_BENCHMARK_DEFINE(eight, 8) +NVBENCH_BENCH(bench_extract) + .set_name("extract") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp new file mode 100644 index 00000000000..530b09b7d6a --- /dev/null +++ b/cpp/benchmarks/string/gather.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_gather(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const input_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + + data_profile const map_profile = data_profile_builder().no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows); + auto const map_table = + create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = cudf::strings_column_view(input_table->view().column(0)).chars_size(); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::gather( + input_table->view(), map_table->view().column(0), cudf::out_of_bounds_policy::NULLIFY); + }); +} + +NVBENCH_BENCH(bench_gather) + .set_name("gather") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp new file mode 100644 index 00000000000..a122c0022a9 --- /dev/null +++ b/cpp/benchmarks/string/join_strings.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
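bench_gather above times the table-level gather path on strings data; a hedged sketch of the call it wraps, with illustrative names:

#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>

// Rows of 'input' are selected by the integer indices in 'map'; NULLIFY makes
// any out-of-range index produce a null row instead of undefined behavior.
std::unique_ptr<cudf::table> gather_rows(cudf::table_view const& input,
                                         cudf::column_view const& map)
{
  return cudf::gather(input, map, cudf::out_of_bounds_policy::NULLIFY);
}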
+ */ + +#include + +#include +#include +#include + +#include + +static void bench_join(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto const chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written + + std::string separator(":"); + std::string narep("null"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::join_strings(input, separator, narep); + }); +} + +NVBENCH_BENCH(bench_join) + .set_name("strings_join") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu index 1d19e63102d..7e89edf3e17 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/string/json.cu @@ -32,7 +32,7 @@ class JsonPath : public cudf::benchmark {}; -const std::vector Books{ +std::vector const Books{ R"json({ "category": "reference", "author": "Nigel Rees", @@ -60,7 +60,7 @@ const std::vector Books{ "price": 22.99 })json"}; constexpr int Approx_book_size = 110; -const std::vector Bicycles{ +std::vector const Bicycles{ R"json({"color": "red", "price": 9.95})json", R"json({"color": "green", "price": 29.95})json", R"json({"color": "blue", "price": 399.95})json", diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index 4540e4a8f42..36c4bf64a00 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -51,6 +50,6 @@ static void bench_lengths(nvbench::state& state) } NVBENCH_BENCH(bench_lengths) - .set_name("strings_lengths") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}); + .set_name("lengths") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index d86c31480dd..6ac832471a5 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -15,12 +15,12 @@ */ #include -#include #include #include #include +#include #include #include #include @@ -28,22 +28,33 @@ #include namespace { -std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t hit_rate) +std::unique_ptr build_input_column(cudf::size_type n_rows, + cudf::size_type row_width, + int32_t hit_rate) { // build input table using the following data - auto data = cudf::test::strings_column_wrapper({ - "123 abc 4567890 DEFGHI 0987 5W43", 
// matches always; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }); - auto data_view = cudf::column_view(data); + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches always; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); // compute number of rows in n_rows that should match auto matches = static_cast(n_rows * hit_rate) / 100; @@ -71,14 +82,20 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, int32_t static void bench_like(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const hit_rate = static_cast(state.get_int64("hit_rate")); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); - auto col = build_input_column(n_rows, hit_rate); + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto col = build_input_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); // This pattern forces reading the entire target string (when matched expected) - auto pattern = std::string("% 5W4_"); // regex equivalent: ".* 5W4." + auto pattern = std::string("% 5W4_"); // regex equivalent: ".* 5W4.$" state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well @@ -93,5 +110,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("hit_rate", {1, 5, 10, 25, 70, 100}); + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index f719fe31bd8..b8efd76ab41 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -14,72 +14,54 @@ * limitations under the License. 
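In the LIKE pattern used by bench_like, '%' matches any sequence of characters and '_' matches exactly one, so "% 5W4_" cannot match without scanning each string. A hedged sketch of the call under test (the contains.hpp header path and the defaulted escape argument are assumptions):

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Rows containing " 5W4" followed by exactly one more character yield true.
std::unique_ptr<cudf::column> run_like(cudf::strings_column_view const& input)
{
  return cudf::strings::like(input, cudf::string_scalar("% 5W4_"));
}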
*/ -#include "string_bench_args.hpp" - #include #include -#include - -#include #include #include #include #include -class StringReplace : public cudf::benchmark {}; - -enum replace_type { replace_re, replace_re_multi, replace_backref }; +#include -static void BM_replace(benchmark::State& state, replace_type rt) +static void bench_replace(nvbench::state& state) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const rtype = state.get_string("type"); + + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); cudf::strings_column_view input(column->view()); - cudf::test::strings_column_wrapper repls({"#", ""}); - auto prog = cudf::strings::regex_program::create("\\d+"); - auto prog_backref = cudf::strings::regex_program::create("(\\d+)"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case replace_type::replace_re: // contains_re and matches_re use the same main logic - cudf::strings::replace_re(input, *prog); - break; - case replace_type::replace_re_multi: // counts occurrences of pattern - cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls)); - break; - case replace_type::replace_backref: // returns occurrences of matches - cudf::strings::replace_with_backrefs(input, *prog_backref, "#\\1X"); - break; - } - } + auto program = cudf::strings::regex_program::create("(\\d+)"); - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (rtype == "backref") { + auto replacement = std::string("#\\1X"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::replace_with_backrefs(input, *program, replacement); + }); + } else { + auto replacement = std::string("77"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::strings::replace_re(input, *program, replacement); + }); + } } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringReplace, name) \ - (::benchmark::State & st) { BM_replace(st, name); } \ - BENCHMARK_REGISTER_F(StringReplace, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(replace_re) -STRINGS_BENCHMARK_DEFINE(replace_re_multi) -STRINGS_BENCHMARK_DEFINE(replace_backref) +NVBENCH_BENCH(bench_replace) + .set_name("replace_re") + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + 
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index 4c3846c79bb..31cd4639115 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -51,6 +50,6 @@ static void bench_reverse(nvbench::state& state) } NVBENCH_BENCH(bench_reverse) - .set_name("strings_reverse") - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128}); + .set_name("reverse") + .add_int64_axis("row_width", {8, 16, 32, 64, 128}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index e0b801ea0a7..6c1d7d98d3a 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -33,7 +33,7 @@ class StringSlice : public cudf::benchmark {}; -enum slice_type { position, multi_position, delimiter, multi_delimiter }; +enum slice_type { position, multi_position }; static void BM_slice(benchmark::State& state, slice_type rt) { @@ -47,8 +47,6 @@ static void BM_slice(benchmark::State& state, slice_type rt) auto stops_itr = thrust::constant_iterator(max_str_length / 2); cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); - auto delim_itr = thrust::constant_iterator(" "); - cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); for (auto _ : state) { cuda_event_timer raii(state, true, cudf::get_default_stream()); @@ -57,10 +55,6 @@ static void BM_slice(benchmark::State& state, slice_type rt) cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2); break; case multi_position: cudf::strings::slice_strings(input, starts, stops); break; - case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break; - case multi_delimiter: - cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1); - break; } } @@ -88,5 +82,3 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) STRINGS_BENCHMARK_DEFINE(position) STRINGS_BENCHMARK_DEFINE(multi_position) -STRINGS_BENCHMARK_DEFINE(delimiter) -STRINGS_BENCHMARK_DEFINE(multi_delimiter) diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 021a7341ddd..eb724fabfd1 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -15,8 +15,6 @@ */ #include -#include -#include #include @@ -25,63 +23,49 @@ #include #include -#include +#include -class StringSplit : public cudf::benchmark {}; +static void bench_split(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const stype = state.get_string("type"); -enum split_type { split, split_ws, record, record_ws }; + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } -static void BM_split(benchmark::State& state, split_type rt) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, 
max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (rt) { - case split: cudf::strings::split(input, target); break; - case split_ws: cudf::strings::split(input); break; - case record: cudf::strings::split_record(input, target); break; - case record_ws: cudf::strings::split_record(input); break; - } - } - - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int constexpr min_rows = 1 << 12; - int constexpr max_rows = 1 << 24; - int constexpr row_mult = 8; - int constexpr min_rowlen = 1 << 5; - int constexpr max_rowlen = 1 << 13; - int constexpr len_mult = 2; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } + if (stype == "split") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split(input, target); }); + } else if (stype == "split_ws") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split(input); }); + } else if (stype == "record") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split_record(input, target); }); + } else { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::split_record(input); }); } } -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringSplit, name) \ - (::benchmark::State & st) { BM_split(st, split_type::name); } \ - BENCHMARK_REGISTER_F(StringSplit, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(split) -STRINGS_BENCHMARK_DEFINE(split_ws) -STRINGS_BENCHMARK_DEFINE(record) -STRINGS_BENCHMARK_DEFINE(record_ws) +NVBENCH_BENCH(bench_split) + .set_name("split") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp new file mode 100644 index 00000000000..67aa6f0e008 --- /dev/null +++ b/cpp/benchmarks/string/split_re.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
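The "type" axis above selects between the column-wise and record-wise split APIs, whose outputs have different shapes; the new split_re.cpp benchmark that follows applies the same pattern to the regex variant, split_record_re. A hedged sketch of the two shapes, with assumed header paths:

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>

#include <memory>

// split() pads each row out to the maximum token count and returns one strings
// column per token position.
std::unique_ptr<cudf::table> split_columns(cudf::strings_column_view const& input)
{
  return cudf::strings::split(input, cudf::string_scalar("+"));
}

// split_record() instead returns a single LIST column holding a variable-length
// list of tokens per row.
std::unique_ptr<cudf::column> split_lists(cudf::strings_column_view const& input)
{
  return cudf::strings::split_record(input, cudf::string_scalar("+"));
}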
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +static void bench_split(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto prog = cudf::strings::regex_program::create("\\d+"); + + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + cudf::strings_column_view input(column->view()); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + // gather some throughput statistics as well + auto chars_size = input.chars_size(); + state.add_element_count(chars_size, "chars_size"); // number of bytes; + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); // all bytes are written + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::strings::split_record_re(input, *prog); + }); +} + +NVBENCH_BENCH(bench_split) + .set_name("split_re") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp index bbf90e6f68a..5993bb23542 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state, CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); if (l2_cache_bytes > 0) { - const int memset_value = 0; + int const memset_value = 0; rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); CUDF_CUDA_TRY( cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp new file mode 100644 index 00000000000..8a8bd9ae586 --- /dev/null +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +#include + +#include + +static void bench_edit_distance(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const strings_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input1(strings_table->view().column(0)); + cudf::strings_column_view input2(strings_table->view().column(1)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input1.chars_size() + input2.chars_size(); + state.add_global_memory_reads(chars_size); + // output are integers (one per row) + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::edit_distance(input1, input2); }); +} + +NVBENCH_BENCH(bench_edit_distance) + .set_name("edit_distance") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp new file mode 100644 index 00000000000..5bbd2fc6819 --- /dev/null +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
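bench_edit_distance above reports one integer per row: the pairwise Levenshtein distance between the two input columns. A small hedged example with illustrative values:

#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <nvtext/edit_distance.hpp>

void edit_distance_example()
{
  cudf::test::strings_column_wrapper col1({"kitten", "hello"});
  cudf::test::strings_column_wrapper col2({"sitting", "hello"});
  // One distance per row: {3, 0} -- "kitten" -> "sitting" takes three edits.
  auto distances = nvtext::edit_distance(cudf::strings_column_view(col1),
                                         cudf::strings_column_view(col2));
}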
+ */ + +#include + +#include + +#include + +#include + +#include + +static void bench_hash_ngrams(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const ngrams = static_cast(state.get_int64("ngrams")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input(strings_table->view().column(0)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + // output are hashes: approximate total number of hashes + state.add_global_memory_writes(num_rows * ngrams); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::hash_character_ngrams(input, ngrams); + }); +} + +NVBENCH_BENCH(bench_hash_ngrams) + .set_name("hash_ngrams") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp new file mode 100644 index 00000000000..70470b829bd --- /dev/null +++ b/cpp/benchmarks/text/jaccard.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
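bench_hash_ngrams sizes its write counter as num_rows * ngrams, which is only an approximation: hash_character_ngrams produces a list of one hash per character n-gram in each row. A hedged sketch (the generate_ngrams.hpp header path is an assumption):

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/generate_ngrams.hpp>

#include <memory>

// For a row of length L and n-gram size n the list holds (L - n + 1) hashes,
// so the benchmark's num_rows * ngrams write estimate is only approximate.
std::unique_ptr<cudf::column> hash_5grams(cudf::strings_column_view const& input)
{
  return nvtext::hash_character_ngrams(input, 5);
}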
+ */ + +#include + +#include + +#include + +#include + +#include + +static void bench_jaccard(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const substring_width = static_cast(state.get_int64("substring_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const strings_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .no_validity(); + auto const input_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + cudf::strings_column_view input1(input_table->view().column(0)); + cudf::strings_column_view input2(input_table->view().column(1)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input1.chars_size() + input2.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::jaccard_index(input1, input2, substring_width); + }); +} + +NVBENCH_BENCH(bench_jaccard) + .set_name("jaccard") + .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 15c39015d74..1b60caa24de 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -15,7 +15,6 @@ */ #include -#include #include @@ -31,6 +30,7 @@ static void bench_minhash(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; if (static_cast(num_rows) * static_cast(row_width) >= static_cast(std::numeric_limits::max())) { @@ -45,9 +45,9 @@ static void bench_minhash(nvbench::state& state) data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seeds_table = create_random_table( - {cudf::type_to_id()}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); seeds.set_null_mask(rmm::device_buffer{}, 0); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,13 +57,15 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 ? 
nvtext::minhash64(input, seeds.view(), hash_width) + : nvtext::minhash(input, seeds.view(), hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) + .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10, 25}) - .add_int64_axis("seed_count", {2, 26}); + .add_int64_axis("hash_width", {5, 10}) + .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 733f2da8b2a..6878fa4f8b6 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -24,51 +23,43 @@ #include -class TextNormalize : public cudf::benchmark {}; +#include -static void BM_normalize(benchmark::State& state, bool to_lower) +static void bench_normalize(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const normalize_type = state.get_string("type"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - nvtext::normalize_characters(input, to_lower); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen * 4; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } + if (normalize_type == "spaces") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); + } else { + bool const to_lower = (normalize_type == "to_lower"); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::normalize_characters(input, to_lower); + }); } } -#define NVTEXT_BENCHMARK_DEFINE(name, lower) \ - BENCHMARK_DEFINE_F(TextNormalize, name) \ - (::benchmark::State & st) { BM_normalize(st, lower); } \ - 
BENCHMARK_REGISTER_F(TextNormalize, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(characters, false) -NVTEXT_BENCHMARK_DEFINE(to_lower, true) +NVBENCH_BENCH(bench_normalize) + .set_name("normalize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp deleted file mode 100644 index 82d9316e25b..00000000000 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -class TextNormalize : public cudf::benchmark {}; - -static void BM_normalize(benchmark::State& state) -{ - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - cudf::strings_column_view input(column->view()); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - nvtext::normalize_spaces(input); - } - - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); -} - -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNormalize, name) \ - (::benchmark::State & st) { BM_normalize(st); } \ - BENCHMARK_REGISTER_F(TextNormalize, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(spaces) diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 21d69c4d40e..257f62aa728 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -15,8 +15,6 @@ */ #include -#include -#include #include @@ -24,14 +22,19 @@ #include -#include +#include -class TextReplace : public cudf::benchmark {}; +#include -static void BM_replace(benchmark::State& state) +static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const n_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + 
static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", @@ -41,46 +44,32 @@ static void BM_replace(benchmark::State& state) std::default_random_engine generator; std::uniform_int_distribution tokens_dist(0, words.size() - 1); std::string row; // build a row of random tokens - while (static_cast(row.size()) < n_length) + while (static_cast(row.size()) < row_width) row += words[tokens_dist(generator)]; std::uniform_int_distribution position_dist(0, 16); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&](auto idx) { return row.c_str() + position_dist(generator); }); - cudf::test::strings_column_wrapper input(elements, elements + n_rows); + cudf::test::strings_column_wrapper input(elements, elements + num_rows); cudf::strings_column_view view(input); cudf::test::strings_column_wrapper targets({"one", "two", "sevén", "zero"}); cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"}); - for (auto _ : state) { - cuda_event_timer raii(state, true); - nvtext::replace_tokens( - view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * view.chars_size()); -} + auto chars_size = view.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_multiplier = 8; - int const min_row_length = 1 << 5; - int const max_row_length = 1 << 13; - int const length_multiplier = 4; - generate_string_bench_args( - b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::replace_tokens( + view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); + }); } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextReplace, name) \ - (::benchmark::State & st) { BM_replace(st); } \ - BENCHMARK_REGISTER_F(TextReplace, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(replace) +NVBENCH_BENCH(bench_replace) + .set_name("replace") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index a683214448f..1dd7322a5c8 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -27,12 +27,10 @@ #include #include -#define MAX_ROWS_TENSOR 300 - static std::string create_hash_vocab_file() { std::string dir_template{std::filesystem::temp_directory_path().string()}; - if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p; + if (char const* env_p = std::getenv("WORKSPACE")) dir_template = env_p; std::string hash_file = dir_template + "/hash_vocab.txt"; // create a fake hashed vocab text file for this test // this only works with words in the strings in the benchmark code below @@ -57,7 +55,7 @@ static std::string create_hash_vocab_file() static void BM_subword_tokenizer(benchmark::State& state) { auto const nrows = static_cast(state.range(0)); - std::vector 
h_strings(nrows, "This is a test "); + std::vector h_strings(nrows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; @@ -74,8 +72,7 @@ static void BM_subword_tokenizer(benchmark::State& state) max_sequence_length, stride, do_lower, - do_truncate, - MAX_ROWS_TENSOR); + do_truncate); } } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index bd80af08a74..423fe667b05 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -16,8 +16,6 @@ #include #include -#include -#include #include @@ -28,73 +26,57 @@ #include #include -class TextTokenize : public cudf::benchmark {}; +#include -enum class tokenize_type { single, multi, count, count_multi, ngrams, characters }; - -static void BM_tokenize(benchmark::State& state, tokenize_type tt) +static void bench_tokenize(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const tokenize_type = state.get_string("type"); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); - cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - switch (tt) { - case tokenize_type::single: - // single whitespace delimiter - nvtext::tokenize(input); - break; - case tokenize_type::multi: - nvtext::tokenize(input, cudf::strings_column_view(delimiters)); - break; - case tokenize_type::count: - // single whitespace delimiter - nvtext::count_tokens(input); - break; - case tokenize_type::count_multi: - nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); - break; - case tokenize_type::ngrams: - // default is bigrams - nvtext::ngrams_tokenize(input); - break; - case tokenize_type::characters: - // every character becomes a string - nvtext::character_tokenize(input); - break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size()); -} + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (tokenize_type == "whitespace") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::tokenize(input); }); + } else if (tokenize_type == "multi") { + 
cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::tokenize(input, cudf::strings_column_view(delimiters)); + }); + } else if (tokenize_type == "count") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::count_tokens(input); }); + } else if (tokenize_type == "count_multi") { + cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); + }); + } else if (tokenize_type == "ngrams") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); }); + } else if (tokenize_type == "characters") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextTokenize, name) \ - (::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \ - BENCHMARK_REGISTER_F(TextTokenize, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(single) -NVTEXT_BENCHMARK_DEFINE(multi) -NVTEXT_BENCHMARK_DEFINE(count) -NVTEXT_BENCHMARK_DEFINE(count_multi) -NVTEXT_BENCHMARK_DEFINE(ngrams) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_tokenize) + .set_name("tokenize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 362d3825f81..3f985cffb1f 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -126,7 +126,7 @@ struct RowHandle { template __global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) { - const cudf::size_type n_rows = source.num_rows(); + cudf::size_type const n_rows = source.num_rows(); cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; while (index < n_rows) { @@ -141,8 +141,8 @@ __global__ void device_dispatching_kernel(cudf::mutable_table_device_view source template void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_thread) { - const cudf::size_type n_rows = input.num_rows(); - const cudf::size_type n_cols = input.num_columns(); + cudf::size_type const n_rows = input.num_rows(); + cudf::size_type const n_cols = input.num_columns(); cudf::detail::grid_1d grid_config{n_rows, block_size}; int grid_size = grid_config.num_blocks; @@ -169,9 +169,9 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const auto n_cols = static_cast(state.range(0)); - const auto source_size = static_cast(state.range(1)); - const auto work_per_thread = static_cast(state.range(2)); + auto const n_cols = static_cast(state.range(0)); + auto const source_size = static_cast(state.range(1)); + auto const work_per_thread = static_cast(state.range(2)); auto init = cudf::make_fixed_width_scalar(static_cast(0)); diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index c877c9c6466..894dc9649e2 
100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -162,13 +162,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static - arrow_dataset_static + GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static + parquet_static arrow_acero_static arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" + "ARROW_ACERO ON" "ARROW_IPC ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" @@ -221,7 +222,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # Set this to enable `find_package(Parquet)` set(Parquet_DIR "${Arrow_DIR}") endif() - # Set this to enable `find_package(ArrowDataset)` + # Set this to enable `find_package(ArrowDataset)`. This will call find_package(ArrowAcero) for + # us set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) endif() @@ -295,9 +297,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB APPEND arrow_code_string " - if(NOT TARGET xsimd) - add_library(xsimd INTERFACE IMPORTED) - target_include_directories(xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") + if(NOT TARGET arrow::xsimd) + add_library(arrow::xsimd INTERFACE IMPORTED) + target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") endif() " ) @@ -314,6 +316,26 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(ENABLE_PARQUET) + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) + set(arrow_dataset_code_string [=[ if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) @@ -381,7 +403,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 11.0.0 + 12.0.1 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/cpp/cmake/thirdparty/get_cufile.cmake index 21088f4ec0f..c0235eba508 100644 --- a/cpp/cmake/thirdparty/get_cufile.cmake +++ b/cpp/cmake/thirdparty/get_cufile.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,7 +16,7 @@ function(find_and_configure_cufile) list(APPEND CMAKE_MODULE_PATH ${CUDF_SOURCE_DIR}/cmake/Modules) - rapids_find_package(cuFile QUIET) + rapids_find_package(cuFile) if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake new file mode 100644 index 00000000000..0e03352c335 --- /dev/null +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -0,0 +1,37 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds thrust and sets any additional necessary environment variables. +function(find_and_configure_libcudacxx) + # Make sure we install libcudacxx beside our patched version of thrust + include(GNUInstallDirs) + set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") + set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + + if(libcudacxx_SOURCE_DIR) + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL + libcudacxx + [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] + cudf-exports + ) + endif() +endfunction() + +find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 25a4c9dd3ba..39a9de15fa6 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -37,8 +37,8 @@ function(find_and_configure_thrust) # Store where CMake can find our custom Thrust install include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/cmake/thrust]=] - cudf-exports + INSTALL Thrust + [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] cudf-exports ) endif() endfunction() diff --git a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff b/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff index 3e7a0f8ed77..04f96f49b48 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff +++ b/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff @@ -1,15 +1,17 @@ diff --git a/nvbench/main.cuh b/nvbench/main.cuh -index 0ba82d7..7ab02c1 100644 +index 0ba82d7..cca5273 100644 --- a/nvbench/main.cuh +++ b/nvbench/main.cuh -@@ -54,6 +54,14 @@ +@@ -54,6 +54,16 @@ // clang-format on #endif +#ifndef NVBENCH_ENVIRONMENT +namespace nvbench { +struct no_environment -+{}; ++{ ++ no_environment(int, char const *const *) {} ++}; +} +#define NVBENCH_ENVIRONMENT nvbench::no_environment +#endif @@ -17,11 +19,11 @@ index 0ba82d7..7ab02c1 100644 #define NVBENCH_MAIN_PARSE(argc, argv) \ nvbench::option_parser parser; \ parser.parse(argc, argv) -@@ -77,6 +85,7 @@ +@@ -77,6 +87,7 @@ printer.set_total_state_count(total_states); \ \ printer.set_completed_state_count(0); \ -+ [[maybe_unused]] auto env_state = NVBENCH_ENVIRONMENT(); \ ++ [[maybe_unused]] auto env_state = NVBENCH_ENVIRONMENT(argc, argv); \ for (auto &bench_ptr : benchmarks) \ { \ bench_ptr->set_printer(printer); \ diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index d5df222ae37..7be868081b6 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -12,11 +12,6 @@ "file" : "nvbench/use_existing_fmt.diff", "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", "fixed_in" : "" - }, - { - "file" : "nvbench/public_fmt_dep_in_conda.diff", - "issue" : "Propagate fmt requirement in conda envs [https://github.com/NVIDIA/nvbench/pull/127]", - "fixed_in" : "" } ] } diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index e1e8a0fa31b..b072d252881 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.20 +# Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -32,13 +32,13 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "libcudf" +PROJECT_NAME = libcudf # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.06.00 +PROJECT_NUMBER = 23.10.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. 
+# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -305,7 +313,10 @@ OPTIMIZE_OUTPUT_SLICE = NO # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = cu=C++ \ cuh=C++ @@ -516,6 +527,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -553,11 +571,18 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) and Mac users are advised to set this option to NO. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -796,7 +821,10 @@ WARN_IF_DOC_ERROR = YES WARN_NO_PARAMDOC = YES # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = NO @@ -846,8 +874,8 @@ INPUT = main_page.md \ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 @@ -860,13 +888,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.cpp \ *.hpp \ @@ -1270,10 +1300,11 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. @@ -1315,8 +1346,8 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1391,7 +1422,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1399,8 +1431,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. 
For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1408,16 +1440,16 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = @@ -1429,9 +1461,9 @@ QHP_CUST_FILTER_ATTRS = QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1558,7 +1590,7 @@ USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. @@ -1588,7 +1620,8 @@ MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1635,7 +1668,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). +# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1648,8 +1682,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. 
SEARCHENGINE_URL = @@ -1839,6 +1874,16 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO +# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source +# code with syntax highlighting in the LaTeX output. +# +# Note that which sources are shown also depends on other settings such as +# SOURCE_BROWSER. +# The default value is: NO. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_SOURCE_CODE = NO + # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See # https://en.wikipedia.org/wiki/BibTeX and \cite for more info. @@ -1919,6 +1964,16 @@ RTF_STYLESHEET_FILE = RTF_EXTENSIONS_FILE = +# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code +# with syntax highlighting in the RTF output. +# +# Note that which sources are shown also depends on other settings such as +# SOURCE_BROWSER. +# The default value is: NO. +# This tag requires that the tag GENERATE_RTF is set to YES. + +RTF_SOURCE_CODE = NO + #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- @@ -2015,6 +2070,15 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook +# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the +# program listings (including syntax highlighting and cross-referencing +# information) to the DOCBOOK output. Note that enabling this will significantly +# increase the size of the DOCBOOK output. +# The default value is: NO. +# This tag requires that the tag GENERATE_DOCBOOK is set to YES. + +DOCBOOK_PROGRAMLISTING = NO + #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- @@ -2162,7 +2226,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.06 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.10 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to @@ -2301,10 +2365,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. 
If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2494,9 +2580,11 @@ DOT_MULTI_TARGETS = NO GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc and +# plantuml temporary files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 91c3dccfdc6..fc2f72de33c 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -440,17 +440,18 @@ libcudf throws under different circumstances, see the [section on error handling ## Streams -CUDA streams are not yet exposed in external libcudf APIs. However, in order to ease the transition -to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be -implemented using asynchronous APIs on the default stream (e.g., stream 0). - -The recommended pattern for doing this is to make the definition of the external API invoke an -internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. If the -detail API also accepts a memory resource parameter, the stream parameter should be ideally placed -just *before* the memory resource. The public API will call the detail API and provide -`cudf::get_default_stream()`. The implementation should be wholly contained in the `detail` API -definition and use only asynchronous versions of CUDA APIs with the stream parameter. +libcudf is in the process of adding support for asynchronous execution using +CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs +that allocate device memory or execute a kernel should accept an +`rmm::cuda_stream_view` parameter at the end with a default value of +`cudf::get_default_stream()`. There is one exception to this rule: if the API +also accepts a memory resource parameter, the stream parameter should be placed +just *before* the memory resource. This API should then forward the call to a +corresponding `detail` API with an identical signature, except that the +`detail` API should not have a default parameter for the stream ([detail APIs +should always avoid default parameters](#default-parameters)). The +implementation should be wholly contained in the `detail` API definition and +use only asynchronous versions of CUDA APIs with the stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. 
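As an aside for reviewers, here is a minimal sketch of the public/detail stream-forwarding pattern that the revised guidance above describes. The function name `compute_something` and its header layout are hypothetical, chosen only to illustrate parameter placement and forwarding; they are not part of this patch.

```cpp
// Hypothetical illustration of the documented pattern: the public API takes the
// stream just before the memory resource, defaulting to cudf::get_default_stream(),
// and forwards to a detail API whose stream parameter has no default value.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

namespace cudf::detail {
// Detail API: same signature as the public API, but no default for the stream.
std::unique_ptr<column> compute_something(column_view const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr);
}  // namespace cudf::detail

namespace cudf {
// Public API: supplies the default stream when the caller does not pass one and
// forwards everything to the detail implementation.
std::unique_ptr<column> compute_something(
  column_view const& input,
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  return detail::compute_something(input, stream, mr);
}
}  // namespace cudf
```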
@@ -488,7 +489,7 @@ void external_function(...){ when a non-pointer value is returned from the API that is the result of an asynchronous device-to-host copy, the stream used for the copy should be synchronized before returning. However, when a column is returned, the stream should not be synchronized because doing so will break -asynchrony if and when we add an asynchronous API to libcudf. +asynchrony. **Note:** `cudaDeviceSynchronize()` should *never* be used. This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index 2d9b32362bf..c19976a956b 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -458,3 +458,69 @@ Column comparison functions in the `cudf::test::detail` namespace should **NOT** `include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to the host (`to_host`). + + +## Validating Stream Usage + +### Background + +libcudf employs a custom-built [preload library +docs](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage (the +code may be found +[`here`](https://github.com/rapidsai/cudf/blob/main/cpp/tests/utilities/identify_stream_usage.cpp)). +This library wraps every asynchronous CUDA runtime API call that accepts a stream with a check to +ensure that the passed CUDA stream is a valid one, immediately throwing an exception if an invalid +stream is detected. Running tests with this library loaded immediately triggers errors if any test +accidentally runs code on an invalid stream. + +Stream validity is determined by overloading the definition of libcudf's default stream. Normally, in +libcudf `cudf::get_default_stream` returns one of `rmm`'s default stream values (depending on +whether or not libcudf is compiled with per thread default stream enabled). In the preload library, +this function is redefined to instead return a new user-created stream managed using a +function-local static `rmm::cuda_stream`. An invalid stream in this situation is defined as any of +CUDA's default stream values (cudaStreamLegacy, cudaStreamDefault, or cudaStreamPerThread), since +any kernel that properly uses `cudf::get_default_stream` will now instead be using the custom stream +created by the preload library. + +The preload library supports two different modes, `cudf` mode and `testing` mode. The previous +paragraph describes the behavior of `cudf` mode, where `cudf::get_default_stream` is overloaded. In +`cudf` mode, the preload library ensures that all CUDA runtime APIs are being provided cudf's +default stream. This will detect oversights where, for example, a Thrust call has no stream specified, or +when one of CUDA's default stream values is explicitly specified to a kernel. However, it will not +detect cases where a stream is not correctly forwarded down the call stack, for instance if +some `detail` function that accepts a stream parameter fails to forward it along and instead +erroneously calls `cudf::get_default_stream` instead. + +In `testing` mode, the library instead overloads `cudf::test::get_default_stream`. This function +defined in the `cudf::test` namespace enables a more stringent mode of testing. In `testing` mode, +the preload library instead verifies that all CUDA runtime APIs are instead called using the test +namespace's default stream. 
This distinction is important because cudf internals never use +`cudf::test::get_default_stream`, so this stream value can only appear internally if it was provided +to a public API and forwarded properly all the way down the call stack. While `testing` mode is more +strict than `cudf` mode, it is also more intrusive. `cudf` mode can operate with no changes to the +library or the tests because the preload library overwrites the relevant APIs in place. `testing` +mode, however, can only be used to validate tests that are correctly passing +`cudf::test::get_default_stream` to public libcudf APIs. + +In addition to the preload library, the test suite also implements a [custom memory +resource](https://github.com/rapidsai/cudf/blob/main/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp) +that performs analogous stream verification when its `do_allocate` method is called. During testing +this rmm's default memory resource is set to use this adaptor for additional stream validation. + +### Usage + +When writing tests for a libcudf API, a special set of additional tests should be added to validate +the API's stream usage. These tests should be placed in the `cpp/tests/streams` directory in a file +corresponding to the header containing the tested APIs, e.g. `cpp/tests/streams/copying_test.cpp` +for all APIs declared in `cpp/include/cudf/copying.hpp`. These tests should contain a minimal +invocation of the tested API with no additional assertions since they are solely designed to check +stream usage. When adding these tests to `cpp/tests/CMakeLists.txt`, the `ConfigureTest` CMake +function should be provided the arguments `STREAM_MODE testing`. This change is sufficient for +CTest to set up the test to automatically load the preload library compiled in `testing` mode when +running the test. + +The rest of the test suite is configured to run with the preload library in `cudf` mode. As a +result, all test runs with `ctest` will always include stream validation. Since this configuration +is managed via CMake and CTest, direct execution of the test executables will not use the preload +library at all. Tests will still run and pass normally in this situation, however (with the +exception of the test of the preload library itself). diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 0922611482a..1c1952c4616 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -cmake_minimum_required(VERSION 3.23.1) +cmake_minimum_required(VERSION 3.26.4) project( basic_example @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.06) +set(CUDF_TAG branch-23.10) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index f9c49e24bf5..31a6b12a4bc 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2022-2023, NVIDIA CORPORATION. 
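To make the stream-usage testing workflow described in TESTING.md above more concrete, here is a hedged sketch of what such a test might look like. The file name, fixture name, and the exact `cudf::gather` signature are assumptions for illustration only; the test simply invokes a public API with `cudf::test::get_default_stream()` and asserts nothing, since the preload library loaded by CTest in `testing` mode performs the actual validation.

```cpp
// Hypothetical cpp/tests/streams/copying_test.cpp: a minimal stream-usage test.
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/copying.hpp>
#include <cudf/table/table_view.hpp>

#include <gtest/gtest.h>

class CopyingStreamTest : public cudf::test::BaseFixture {};

TEST_F(CopyingStreamTest, Gather)
{
  cudf::test::fixed_width_column_wrapper<int32_t> source({1, 2, 3, 4, 5});
  cudf::test::fixed_width_column_wrapper<int32_t> gather_map({3, 1, 0});
  // Pass the test namespace's default stream explicitly; in `testing` mode the
  // preload library checks that this stream reaches every CUDA runtime call.
  cudf::gather(cudf::table_view{{source}},
               gather_map,
               cudf::out_of_bounds_policy::DONT_CHECK,
               cudf::test::get_default_stream());
}
```

A matching entry in `cpp/tests/CMakeLists.txt` would register this file via `ConfigureTest` with the arguments `STREAM_MODE testing`, as the guidance above describes.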
-cmake_minimum_required(VERSION 3.23.1) +cmake_minimum_required(VERSION 3.26.4) project( strings_examples @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.06) +set(CUDF_TAG branch-23.10) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp index dbd3c4dbd1b..2fd9daf9339 100644 --- a/cpp/examples/strings/common.hpp +++ b/cpp/examples/strings/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,7 @@ * @brief Main example function returns redacted strings column. * * This function returns a redacted version of the input `names` column - * using the the `visibilities` column as in the following example + * using the `visibilities` column as in the following example * ``` * names visibility --> redacted * John Doe public D John diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index a956550f505..0af4c47e947 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -41,7 +41,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::column_device_view const d_visibilities, cudf::string_view redaction, char* working_memory, - cudf::offset_type const* d_offsets, + cudf::size_type const* d_offsets, cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index 84fb7cfbd5a..db0abe435b0 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -15,12 +15,12 @@ */ #pragma once +#include #include #include #include #include -#include #include #include @@ -72,24 +72,6 @@ struct alignas(8) device_data_reference { } }; -// Type trait for wrapping nullable types in a thrust::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = thrust::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - // Type used for intermediate storage in expression evaluation. template using IntermediateDataType = possibly_null_value_t; @@ -193,6 +175,13 @@ class expression_parser { */ cudf::size_type visit(operation const& expr); + /** + * @brief Visit a column name reference expression. + * + * @param expr Column name reference expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(column_name_reference const& expr); /** * @brief Internal class used to track the utilization of intermediate storage locations. * diff --git a/cpp/include/cudf/ast/detail/expression_transformer.hpp b/cpp/include/cudf/ast/detail/expression_transformer.hpp new file mode 100644 index 00000000000..a6529c338e6 --- /dev/null +++ b/cpp/include/cudf/ast/detail/expression_transformer.hpp @@ -0,0 +1,64 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf::ast::detail { +/** + * @brief Base "visitor" pattern class with the `expression` class for expression transformer. + * + * This class can be used to implement recursive traversal of AST tree, and used to validate or + * translate an AST expression. + */ +class expression_transformer { + public: + /** + * @brief Visit a literal expression. + * + * @param expr Literal expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(literal const& expr) = 0; + + /** + * @brief Visit a column reference expression. + * + * @param expr Column reference expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(column_reference const& expr) = 0; + + /** + * @brief Visit an expression expression + * + * @param expr Expression expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(operation const& expr) = 0; + + /** + * @brief Visit a column name reference expression. + * + * @param expr Column name reference expression + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper visit(column_name_reference const& expr) = 0; + + virtual ~expression_transformer() {} +}; +} // namespace cudf::ast::detail diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 350ce99bcf4..ed7f2d97cef 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -33,6 +35,24 @@ namespace ast { namespace detail { +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. 
+template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + // Traits for valid operator / type combinations template constexpr bool is_valid_binary_op = cuda::std::is_invocable_v; @@ -124,6 +144,9 @@ CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, case ast_operator::IDENTITY: f.template operator()(std::forward(args)...); break; + case ast_operator::IS_NULL: + f.template operator()(std::forward(args)...); + break; case ast_operator::SIN: f.template operator()(std::forward(args)...); break; @@ -534,6 +557,17 @@ struct operator_functor { } }; +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) -> bool + { + return false; + } +}; + template <> struct operator_functor { static constexpr auto arity{1}; @@ -831,6 +865,19 @@ struct operator_functor { } }; +// IS_NULL(null) is true, IS_NULL(valid) is false +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs) -> decltype(!lhs.has_value()) + { + return !lhs.has_value(); + } +}; + // NULL_EQUAL(null, null) is true, NULL_EQUAL(null, valid) is false, and NULL_EQUAL(valid, valid) == // EQUAL(valid, valid) template <> diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 6df6ba71b4c..c5172486fa6 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -29,7 +29,8 @@ namespace ast { // Forward declaration. namespace detail { class expression_parser; -} +class expression_transformer; +} // namespace detail /** * @brief A generic expression that can be evaluated to return a value. @@ -46,6 +47,15 @@ struct expression { */ virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0; + /** + * @brief Accepts a visitor class. + * + * @param visitor The `expression_transformer` transforming this expression tree + * @return Reference wrapper of transformed expression + */ + virtual std::reference_wrapper accept( + detail::expression_transformer& visitor) const = 0; + /** * @brief Returns true if the expression may evaluate to null. * @@ -112,6 +122,7 @@ enum class ast_operator : int32_t { ///< LOGICAL_OR(valid, valid) // Unary operators IDENTITY, ///< Identity function + IS_NULL, ///< Check if operand is null SIN, ///< Trigonometric sine COS, ///< Trigonometric cosine TAN, ///< Trigonometric tangent @@ -300,13 +311,16 @@ class literal : public expression { [[nodiscard]] generic_scalar_device_view get_value() const { return value; } /** - * @brief Accepts a visitor class. - * - * @param visitor The `expression_parser` parsing this expression tree - * @return Index of device data reference for this instance + * @copydoc expression::accept */ cudf::size_type accept(detail::expression_parser& visitor) const override; + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -396,13 +410,16 @@ class column_reference : public expression { } /** - * @brief Accepts a visitor class. 
- * - * @param visitor The `expression_parser` parsing this expression tree - * @return Index of device data reference for this instance + * @copydoc expression::accept */ cudf::size_type accept(detail::expression_parser& visitor) const override; + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -459,13 +476,16 @@ class operation : public expression { std::vector> get_operands() const { return operands; } /** - * @brief Accepts a visitor class. - * - * @param visitor The `expression_parser` parsing this expression tree - * @return Index of device data reference for this instance + * @copydoc expression::accept */ cudf::size_type accept(detail::expression_parser& visitor) const override; + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + [[nodiscard]] bool may_evaluate_null(table_view const& left, table_view const& right, rmm::cuda_stream_view stream) const override @@ -482,6 +502,48 @@ class operation : public expression { std::vector> const operands; }; +/** + * @brief A expression referring to data from a column in a table. + */ +class column_name_reference : public expression { + public: + /** + * @brief Construct a new column name reference object + * + * @param column_name Name of this column in the table metadata (provided when the expression is + * evaluated). + */ + column_name_reference(std::string column_name) : column_name(std::move(column_name)) {} + + /** + * @brief Get the column name. + * + * @return The name of this column reference + */ + [[nodiscard]] std::string get_column_name() const { return column_name; } + + /** + * @copydoc expression::accept + */ + cudf::size_type accept(detail::expression_parser& visitor) const override; + + /** + * @copydoc expression::accept + */ + std::reference_wrapper accept( + detail::expression_transformer& visitor) const override; + + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override + { + return true; + } + + private: + std::string column_name; +}; + } // namespace ast } // namespace cudf diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index c160cecbf84..a38186458c4 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -80,21 +80,17 @@ class column { * @brief Construct a new column by taking ownership of the contents of a device_uvector. * * @param other The device_uvector whose contents will be moved into the new column. - * @param null_mask Optional, column's null value indicator bitmask. May - * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`. - * @param null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on - * the first invocation of `null_count()`. + * @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0. + * @param null_count The count of null elements. 
*/ template () or cudf::is_chrono())> - column(rmm::device_uvector&& other, - rmm::device_buffer&& null_mask = {}, - size_type null_count = UNKNOWN_NULL_COUNT) + column(rmm::device_uvector&& other, rmm::device_buffer&& null_mask, size_type null_count) : _type{cudf::data_type{cudf::type_to_id()}}, _size{[&]() { CUDF_EXPECTS( other.size() <= static_cast(std::numeric_limits::max()), - "The device_uvector size exceeds the maximum size_type."); + "The device_uvector size exceeds the column size limit", + std::overflow_error); return static_cast(other.size()); }()}, _data{other.release()}, @@ -111,22 +107,19 @@ class column { * * @throws cudf::logic_error if `size < 0` * - * @param[in] dtype The element type - * @param[in] size The number of elements in the column - * @param[in] data The column's data - * @param[in] null_mask Optional, column's null value indicator bitmask. May - * be empty if `null_count` is 0 or `UNKNOWN_NULL_COUNT`. - * @param null_count Optional, the count of null elements. If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on - * the first invocation of `null_count()`. + * @param dtype The element type + * @param size The number of elements in the column + * @param data The column's data + * @param null_mask Column's null value indicator bitmask. May be empty if `null_count` is 0. + * @param null_count Optional, the count of null elements. * @param children Optional, vector of child columns */ template column(data_type dtype, size_type size, B1&& data, - B2&& null_mask = {}, - size_type null_count = UNKNOWN_NULL_COUNT, + B2&& null_mask, + size_type null_count, std::vector>&& children = {}) : _type{dtype}, _size{size}, @@ -169,14 +162,9 @@ class column { /** * @brief Returns the count of null elements. * - * @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any - * point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the - * first invocation of `null_count()` will compute and store the count of null - * elements indicated by the `null_mask` (if it exists). - * * @return The number of null elements */ - [[nodiscard]] size_type null_count() const; + [[nodiscard]] size_type null_count() const { return _null_count; } /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -186,13 +174,10 @@ class column { * * @param new_null_mask New null value indicator bitmask (rvalue overload & * moved) to set the column's null value indicator mask. May be empty if - * `new_null_count` is 0 or `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. If unknown, - * specify `UNKNOWN_NULL_COUNT` to indicate that the null count should be - * computed on the first invocation of `null_count()`. + * `new_null_count` is 0. + * @param new_null_count The count of null elements. */ - void set_null_mask(rmm::device_buffer&& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT); + void set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count); /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -201,25 +186,18 @@ class column { * does not match the size of this column. * * @param new_null_mask New null value indicator bitmask (lvalue overload & copied) to set the - * column's null value indicator mask. May be empty if `new_null_count` is 0 or - * `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. 
If unknown, specify - * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on the first invocation - * of `null_count()`. + * column's null value indicator mask. May be empty if `new_null_count` is 0. + * @param new_null_count The count of null elements * @param stream The stream on which to perform the allocation and copy. Uses the default CUDF * stream if none is specified. */ void set_null_mask(rmm::device_buffer const& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT, + size_type new_null_count, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Updates the count of null elements. * - * @note `UNKNOWN_NULL_COUNT` can be specified as `new_null_count` to force - * the next invocation of `null_count()` to recompute the null count from the - * null mask. - * * @throws cudf::logic_error if `new_null_count > 0 and nullable() == false` * * @param new_null_count The new null count. @@ -321,14 +299,8 @@ class column { operator column_view() const { return this->view(); }; /** - * @brief Creates a mutable, non-owning view of the column's data and - * children. - * - * @note Creating a mutable view of a `column` invalidates the `column`'s - * `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. The user can - * either explicitly update the null count with `set_null_count()`, or - * if not, the null count will be recomputed on the next invocation of - *`null_count()`. + * @brief Creates a mutable, non-owning view of the column's data, null mask, + * and children * * @return The mutable, non-owning view */ @@ -338,13 +310,10 @@ class column { * @brief Implicit conversion operator to a `mutable_column_view`. * * This allows passing a `column` object into a function that accepts a - *`mutable_column_view`. The conversion is automatic. - - * @note Creating a mutable view of a `column` invalidates the `column`'s - * `null_count()` by setting it to `UNKNOWN_NULL_COUNT`. For best performance, - * the user should explicitly update the null count with `set_null_count()`. - * Otherwise, the null count will be recomputed on the next invocation of - * `null_count()`. + * `mutable_column_view`. The conversion is automatic. + * + * The caller is expected to update the null count appropriately if the null mask + * is modified. * * @return Mutable, non-owning `mutable_column_view` */ @@ -357,9 +326,9 @@ class column { ///< buffer containing the column elements rmm::device_buffer _null_mask{}; ///< Bitmask used to represent null values. ///< May be empty if `null_count() == 0` - mutable cudf::size_type _null_count{UNKNOWN_NULL_COUNT}; ///< The number of null elements - std::vector> _children{}; ///< Depending on element type, child - ///< columns may contain additional data + mutable cudf::size_type _null_count{}; ///< The number of null elements + std::vector> _children{}; ///< Depending on element type, child + ///< columns may contain additional data }; /** @} */ // end of group diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index efb96cb6af6..05ef21bd750 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -275,7 +275,7 @@ class alignas(16) column_device_view_base { } /** - * @brief Returns the the specified bitmask word from the `null_mask()`. + * @brief Returns the specified bitmask word from the `null_mask()`. * * @note It is undefined behavior to call this function if `nullable() == * false`. 
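Because the `column.hpp` changes above drop `UNKNOWN_NULL_COUNT`, callers now pass the null mask and its count explicitly when constructing a column. A minimal sketch under that assumption (the helper name is hypothetical and not part of this patch):

```cpp
// Hypothetical helper showing explicit null-mask/null-count construction now
// that the constructors no longer accept an unknown null count.
#include <cudf/column/column.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

std::unique_ptr<cudf::column> make_all_valid_int32_column(cudf::size_type size)
{
  // Device storage for the column elements (left uninitialized in this sketch).
  rmm::device_uvector<int32_t> data(size, cudf::get_default_stream());

  // The mask and its null count travel together: an all-valid mask has zero nulls.
  auto null_mask = cudf::create_null_mask(size, cudf::mask_state::ALL_VALID);

  return std::make_unique<cudf::column>(std::move(data), std::move(null_mask), 0);
}
```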
@@ -442,8 +442,8 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - const auto* d_offsets = d_children[strings_column_view::offsets_column_index].data(); - const char* d_strings = d_children[strings_column_view::chars_column_index].data(); + auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); + char const* d_strings = d_children[strings_column_view::chars_column_index].data(); size_type offset = d_offsets[index]; return string_view{d_strings + offset, d_offsets[index + 1] - offset}; } diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index f0f7bf092d3..68d7df7e4eb 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -374,7 +374,7 @@ std::unique_ptr make_fixed_width_column( * @return Constructed strings column */ std::unique_ptr make_strings_column( - cudf::device_span const> strings, + cudf::device_span const> strings, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -406,7 +406,7 @@ std::unique_ptr make_strings_column( */ std::unique_ptr make_strings_column( cudf::device_span string_views, - const string_view null_placeholder, + string_view const null_placeholder, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 6d722675626..d80c720a255 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -160,14 +160,9 @@ class column_view_base { /** * @brief Returns the count of null elements * - * @note If the column was constructed with `UNKNOWN_NULL_COUNT`, or if at any - * point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the - * first invocation of `null_count()` will compute and store the count of null - * elements indicated by the `null_mask` (if it exists). - * * @return The count of null elements */ - [[nodiscard]] size_type null_count() const; + [[nodiscard]] size_type null_count() const { return _null_count; } /** * @brief Returns the count of null elements in the range [begin, end) @@ -263,10 +258,6 @@ class column_view_base { * * If `null_count()` is zero, `null_mask` is optional. * - * If the null count of the `null_mask` is not specified, it defaults to - * `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then - * compute the null count if `null_mask` exists. - * * If `type` is `EMPTY`, the specified `null_count` will be ignored and * `null_count()` will always return the same value as `size()` * @@ -280,17 +271,17 @@ class column_view_base { * @param type The element type * @param size The number of elements * @param data Pointer to device memory containing the column elements - * @param null_mask Optional, pointer to device memory containing the null + * @param null_mask Pointer to device memory containing the null * indicator bitmask - * @param null_count Optional, the number of null elements. - * @param offset optional, index of the first element + * @param null_count The number of null elements. 
+ * @param offset Optional, index of the first element */ column_view_base(data_type type, size_type size, void const* data, - bitmask_type const* null_mask = nullptr, - size_type null_count = UNKNOWN_NULL_COUNT, - size_type offset = 0); + bitmask_type const* null_mask, + size_type null_count, + size_type offset = 0); }; class mutable_column_view_base : public column_view_base { @@ -357,10 +348,6 @@ class column_view : public detail::column_view_base { * * If `null_count()` is zero, `null_mask` is optional. * - * If the null count of the `null_mask` is not specified, it defaults to - * `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then - * compute the null count if `null_mask` exists. - * * If `type` is `EMPTY`, the specified `null_count` will be ignored and * `null_count()` will always return the same value as `size()` * @@ -374,18 +361,18 @@ class column_view : public detail::column_view_base { * @param type The element type * @param size The number of elements * @param data Pointer to device memory containing the column elements - * @param null_mask Optional, pointer to device memory containing the null + * @param null_mask Pointer to device memory containing the null * indicator bitmask - * @param null_count Optional, the number of null elements. - * @param offset optional, index of the first element - * @param children optional, depending on the element type, child columns may + * @param null_count The number of null elements. + * @param offset Optional, index of the first element + * @param children Optional, depending on the element type, child columns may * contain additional data */ column_view(data_type type, size_type size, void const* data, - bitmask_type const* null_mask = nullptr, - size_type null_count = UNKNOWN_NULL_COUNT, + bitmask_type const* null_mask, + size_type null_count, size_type offset = 0, std::vector const& children = {}); @@ -435,8 +422,9 @@ class column_view : public detail::column_view_base { cudf::data_type{cudf::type_to_id()}, data.size(), data.data(), nullptr, 0, 0, {}) { CUDF_EXPECTS( - data.size() < static_cast(std::numeric_limits::max()), - "Data exceeds the maximum size of a column view."); + data.size() <= static_cast(std::numeric_limits::max()), + "Data exceeds the column size limit", + std::overflow_error); } /** @@ -509,12 +497,8 @@ class mutable_column_view : public detail::column_view_base { /** * @brief Construct a `mutable_column_view` from pointers to device memory for - *the elements and bitmask of the column. + * the elements and bitmask of the column. - * If the null count of the `null_mask` is not specified, it defaults to - * `UNKNOWN_NULL_COUNT`. The first invocation of `null_count()` will then - * compute the null count. - * * If `type` is `EMPTY`, the specified `null_count` will be ignored and * `null_count()` will always return the same value as `size()` * @@ -528,19 +512,19 @@ class mutable_column_view : public detail::column_view_base { * @param type The element type * @param size The number of elements * @param data Pointer to device memory containing the column elements - * @param null_mask Optional, pointer to device memory containing the null + * @param null_mask Pointer to device memory containing the null indicator * bitmask - * @param null_count Optional, the number of null elements. - * @param offset optional, index of the first element - * @param children optional, depending on the element type, child columns may + * @param null_count The number of null elements. 
+ * @param offset Optional, index of the first element + * @param children Optional, depending on the element type, child columns may * contain additional data */ mutable_column_view(data_type type, size_type size, void* data, - bitmask_type* null_mask = nullptr, - size_type null_count = cudf::UNKNOWN_NULL_COUNT, + bitmask_type* null_mask, + size_type null_count, size_type offset = 0, std::vector const& children = {}); diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index 2b4eee607e2..9ee55275a5e 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -34,61 +35,64 @@ namespace cudf { /** * @brief Concatenates `views[i]`'s bitmask from the bits * `[views[i].offset(), views[i].offset() + views[i].size())` for all elements - * views[i] in views into a `device_buffer` + * `views` into an `rmm::device_buffer` * - * Returns empty `device_buffer` if the column is not nullable + * Returns an empty buffer if the column is not nullable. * - * @param views host_span of column views whose bitmasks will be concatenated - * @param mr Device memory resource used for allocating the new device_buffer - * @return A `device_buffer` containing the bitmasks of all the column views in the views vector + * @param views Column views whose bitmasks will be concatenated + * @param mr Device memory resource used for allocating the returned memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Bitmasks of all the column views in the views vector */ rmm::device_buffer concatenate_masks( host_span views, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Concatenates multiple columns into a single column. + * @brief Concatenates multiple columns into a single column * * @throws cudf::logic_error If types of the input columns mismatch - * @throws std::overflow_error If the the total number of output rows exceeds cudf::size_type + * @throws std::overflow_error If the total number of output rows exceeds cudf::size_type * - * @param columns_to_concat host_span of column views to be concatenated into a single column + * @param columns_to_concat Column views to be concatenated into a single column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A single column having all the rows from the elements of `columns_to_concat` respectively * in the same order. */ std::unique_ptr concatenate( host_span columns_to_concat, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Columns of `tables_to_concat` are concatenated vertically to return a * single table * - * @ingroup column_concatenate - * - * example: - * ``` - * column_view c0; //Contains {0,1,2,3} - * column_view c1; //Contains {4,5,6,7} + * @code{.pseudo} + * column_view c0 is {0,1,2,3} + * column_view c1 is {4,5,6,7} * table_view t0{{c0, c0}}; * table_view t1{{c1, c1}}; * ... 
* auto t = concatenate({t0.view(), t1.view()}); - * column_view tc0 = (t->view()).column(0); //Contains {0,1,2,3,4,5,6,7} - * column_view tc1 = (t->view()).column(1); //Contains {0,1,2,3,4,5,6,7} - * ``` + * column_view tc0 = (t->view()).column(0) is {0,1,2,3,4,5,6,7} + * column_view tc1 = (t->view()).column(1) is {0,1,2,3,4,5,6,7} + * @endcode * * @throws cudf::logic_error If number of columns mismatch - * @throws std::overflow_error If the the total number of output rows exceeds cudf::size_type + * @throws std::overflow_error If the total number of output rows exceeds cudf::size_type * - * @param tables_to_concat host_span of table views to be concatenated into a single table + * @param tables_to_concat Table views to be concatenated into a single table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A single table having all the rows from the elements of * `tables_to_concat` respectively in the same order. */ std::unique_ptr concatenate( host_span tables_to_concat, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 62d668a98cb..bf10f1fd489 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -28,7 +28,7 @@ namespace cudf { * @addtogroup column_copy * @{ * @file - * @brief Table APIs for contiguous_split, pack, unpack, and metadadata + * @brief Table APIs for contiguous_split, pack, unpack, and metadata */ /** @@ -127,6 +127,153 @@ std::vector contiguous_split( std::vector const& splits, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +namespace detail { +struct contiguous_split_state; +}; + +/** + * @brief Perform a chunked "pack" operation of the input `table_view` using a user provided + * buffer of size `user_buffer_size`. + * + * The intent of this operation is to be used in a streamed fashion at times of GPU + * out-of-memory, where we want to minimize the number of small cudaMemcpy calls and + * tracking of all the metadata associated with cudf tables. Because of the memory constraints, + * all thrust and scratch memory allocations are using the passed-in memory resource exclusively, + * not a per-device memory resource. + * + * This class defines two methods that must be used in concert to carry out the chunked_pack: + * has_next and next. Here is an example: + * + * @code{.pseudo} + * // Create a table_view + * cudf::table_view tv = ...; + * + * // Choose a memory resource (optional). This memory resource is used for scratch/thrust temporary + * // data. In memory constrained cases, this can be used to set aside scratch memory + * // for `chunked_pack` at the beginning of a program. + * auto mr = rmm::mr::get_current_device_resource(); + * + * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be + * // occupied by this algorithm. + * // + * // Internally, the GPU unit of work is a 1MB batch. When we instantiate `cudf::chunked_pack`, + * // all the 1MB batches for the source table_view are computed up front. Additionally, + * // chunked_pack calculates the number of iterations that are required to go through all those + * // batches given a `user_buffer_size` buffer. 
The number of 1MB batches in each iteration (chunk) + * // equals the number of CUDA blocks that will be used for the main kernel launch. + * // + * std::size_t user_buffer_size = 128*1024*1024; + * + * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, mr); + * + * std::size_t host_offset = 0; + * auto host_buffer = ...; // obtain a host buffer you would like to copy to + * + * while (chunked_packer->has_next()) { + * // get a user buffer of size `user_buffer_size` + * cudf::device_span user_buffer = ...; + * std::size_t bytes_copied = chunked_packer->next(user_buffer); + * + * // buffer will hold the contents of at most `user_buffer_size` bytes + * // of the contiguously packed input `table_view`. You are now free to copy + * // this memory somewhere else, for example, to host. + * cudaMemcpyAsync( + * host_buffer.data() + host_offset, + * user_buffer.data(), + * bytes_copied, + * cudaMemcpyDefault, + * stream); + * + * host_offset += bytes_copied; + * } + * @endcode + */ +class chunked_pack { + public: + /** + * @brief Construct a `chunked_pack` class. + * + * @param input source `table_view` to pack + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be + * at least 1MB + * @param temp_mr An optional memory resource to be used for temporary and scratch allocations + * only + */ + explicit chunked_pack( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Destructor that will be implemented as default. Declared with definition here because + * contiguous_split_state is incomplete at this stage. + */ + ~chunked_pack(); + + /** + * @brief Obtain the total size of the contiguously packed `table_view`. + * + * @return total size (in bytes) of all the chunks + */ + [[nodiscard]] std::size_t get_total_contiguous_size() const; + + /** + * @brief Function to check if there are chunks left to be copied. + * + * @return true if there are chunks left to be copied, and false otherwise + */ + [[nodiscard]] bool has_next() const; + + /** + * @brief Packs the next chunk into `user_buffer`. This should be called as long as + * `has_next` returns true. If `next` is called when `has_next` is false, an exception + * is thrown. + * + * @throws cudf::logic_error If the size of `user_buffer` is different than `user_buffer_size` + * @throws cudf::logic_error If called after all chunks have been copied + * + * @param user_buffer device span target for the chunk. The size of this span must equal + * the `user_buffer_size` parameter passed at construction + * @return The number of bytes that were written to `user_buffer` (at most + * `user_buffer_size`) + */ + [[nodiscard]] std::size_t next(cudf::device_span const& user_buffer); + + /** + * @brief Build the opaque metadata for all added columns. + * + * @return A vector containing the serialized column metadata + */ + [[nodiscard]] std::unique_ptr> build_metadata() const; + + /** + * @brief Creates a `chunked_pack` instance to perform a "pack" of the `table_view` + * "input", where a buffer of `user_buffer_size` is filled with chunks of the + * overall operation. This operation can be used in cases where GPU memory is constrained. + * + * The memory resource (`temp_mr`) could be a special memory resource to be used in + * situations when GPU memory is low and we want scratch and temporary allocations to + * happen from a small reserved pool of memory. 
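For the memory-constrained scenario described here, a minimal sketch of passing a dedicated scratch resource to `create` might look as follows; the pool size, the buffer size, and the use of RMM's `pool_memory_resource` are illustrative assumptions, not part of this patch:

@code{.cpp}
#include <cudf/contiguous_split.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

void pack_with_reserved_scratch(cudf::table_view const& tv)
{
  // Reserve a small pool up front so chunked_pack's scratch and temporary
  // allocations do not compete with the rest of the application for device
  // memory (the 64 MB size is hypothetical).
  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> scratch_mr{
    rmm::mr::get_current_device_resource(), 64 * 1024 * 1024};

  // 128 MB destination buffer per chunk; scratch_mr is used only for temporaries.
  auto packer = cudf::chunked_pack::create(tv, 128 * 1024 * 1024, &scratch_mr);

  // Drive packer->has_next() / packer->next(...) exactly as in the example above.
}
@endcode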
Note that it defaults to the regular cuDF + * per-device resource. + * + * @throws cudf::logic_error When user_buffer_size is less than 1MB + * + * @param input source `table_view` to pack + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be + * at least 1MB + * @param temp_mr RMM memory resource to be used for temporary and scratch allocations only + * @return a unique_ptr of chunked_pack + */ + [[nodiscard]] static std::unique_ptr create( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); + + private: + // internal state of contiguous split + std::unique_ptr state; +}; + /** * @brief Deep-copy a `table_view` into a serialized contiguous memory format. * @@ -147,7 +294,7 @@ packed_columns pack(cudf::table_view const& input, * * The metadata from the `table_view` is copied into a host vector of bytes which can be used to * construct a `packed_columns` or `packed_table` structure. The caller is responsible for - * guaranteeing that that all of the columns in the table point into `contiguous_buffer`. + * guaranteeing that all of the columns in the table point into `contiguous_buffer`. * * @param table View of the table to pack * @param contiguous_buffer A contiguous buffer of device memory which contains the data referenced diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 921ef5f65f1..63680473c14 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -68,22 +68,24 @@ enum class out_of_bounds_policy : bool { * * @throws cudf::logic_error if gather_map contains null values. * - * @param[in] source_table The input columns whose rows will be gathered - * @param[in] gather_map View into a non-nullable column of integral indices that maps the + * @param source_table The input columns whose rows will be gathered + * @param gather_map View into a non-nullable column of integral indices that maps the * rows in the source columns to rows in the destination columns. - * @param[in] bounds_policy Policy to apply to account for possible out-of-bounds indices + * @param bounds_policy Policy to apply to account for possible out-of-bounds indices * `DONT_CHECK` skips all bounds checking for gather map values. `NULLIFY` coerces rows that * corresponds to out-of-bounds indices in the gather map to be null elements. Callers should * use `DONT_CHECK` when they are certain that the gather_map contains only valid indices for * better performance. If `policy` is set to `DONT_CHECK` and there are out-of-bounds indices * in the gather map, the behavior is undefined. Defaults to `DONT_CHECK`. - * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of the gather */ std::unique_ptr
gather( table_view const& source_table, column_view const& gather_map, out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -97,11 +99,13 @@ std::unique_ptr
gather( * ``` * * @param source_table Table that will be reversed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Reversed table */ std::unique_ptr
reverse( table_view const& source_table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,11 +119,13 @@ std::unique_ptr
reverse( * ``` * * @param source_column Column that will be reversed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Reversed column */ std::unique_ptr reverse( column_view const& source_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -153,6 +159,7 @@ std::unique_ptr reverse( * to or less than the number of elements in the source columns. * @param target The set of columns into which values from the source_table * are to be scattered + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ @@ -160,6 +167,7 @@ std::unique_ptr
scatter( table_view const& source, column_view const& scatter_map, table_view const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,13 +197,15 @@ std::unique_ptr
scatter( * the rows in the target table to be replaced by source. * @param target The set of columns into which values from the source_table * are to be scattered + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ std::unique_ptr
scatter( - std::vector> const& source, + std::vector> const& source, column_view const& indices, table_view const& target, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -228,15 +238,20 @@ std::unique_ptr empty_like(scalar const& input); * * Supports only fixed-width types. * - * @param[in] input Immutable view of input column to emulate - * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN - * @param[in] mr Device memory resource used to allocate the returned column's device memory + * If the `mask_alloc` allocates a validity mask that mask is also uninitialized + * and the validity bits and the null count should be set by the caller. + * + * @param input Immutable view of input column to emulate + * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return A column with sufficient uninitialized capacity to hold the same * number of elements as `input` of the same type as `input.type()` */ std::unique_ptr allocate_like( column_view const& input, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -244,10 +259,14 @@ std::unique_ptr allocate_like( * * Supports only fixed-width types. * - * @param[in] input Immutable view of input column to emulate - * @param[in] size The desired number of elements that the new column should have capacity for - * @param[in] mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN - * @param[in] mr Device memory resource used to allocate the returned column's device memory + * If the `mask_alloc` allocates a validity mask that mask is also uninitialized + * and the validity bits and the null count should be set by the caller. + * + * @param input Immutable view of input column to emulate + * @param size The desired number of elements that the new column should have capacity for + * @param mask_alloc Optional, Policy for allocating null mask. Defaults to RETAIN + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return A column with sufficient uninitialized capacity to hold the specified number of elements * as `input` of the same type as `input.type()` */ @@ -255,6 +274,7 @@ std::unique_ptr allocate_like( column_view const& input, size_type size, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -299,12 +319,14 @@ std::unique_ptr
empty_like(table_view const& input_table); * @param source_end The index of the last element in the source range * (exclusive) * @param target_begin The starting index of the target range (inclusive) + * @param stream CUDA stream used for device memory operations and kernel launches */ void copy_range_in_place(column_view const& source, mutable_column_view& target, size_type source_begin, size_type source_end, - size_type target_begin); + size_type target_begin, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Copies a range of elements out-of-place from one column to another. @@ -332,6 +354,7 @@ void copy_range_in_place(column_view const& source, * @param source_end The index of the last element in the source range * (exclusive) * @param target_begin The starting index of the target range (inclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result target column */ @@ -341,6 +364,7 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -372,6 +396,7 @@ std::unique_ptr copy_range( * @param input Column to be shifted * @param offset The offset by which to shift the input * @param fill_value Fill value for indeterminable outputs + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * * @throw cudf::logic_error if @p input dtype is neither fixed-width nor string type @@ -383,6 +408,7 @@ std::unique_ptr shift( column_view const& input, size_type offset, scalar const& fill_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -413,14 +439,19 @@ std::unique_ptr shift( * * @param input View of column to slice * @param indices Indices used to take slices of `input` + * @param stream CUDA stream used for device memory operations and kernel launches * @return Vector of views of `input` indicated by the ranges in `indices` */ -std::vector slice(column_view const& input, host_span indices); +std::vector slice(column_view const& input, + host_span indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_slice - * @copydoc cudf::slice(column_view const&, host_span) + * @copydoc cudf::slice(column_view const&, host_span, rmm::cuda_stream_view) */ -std::vector slice(column_view const& input, std::initializer_list indices); +std::vector slice(column_view const& input, + std::initializer_list indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Slices a `table_view` into a set of `table_view`s according to a set of indices. 
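As a minimal sketch of the new stream-aware slicing overloads (the input column, the split points, and the stream are illustrative, not part of this patch), slicing a column into two views might look like:

@code{.cpp}
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <vector>

std::vector<cudf::column_view> first_two_and_rest(cudf::column_view const& col,
                                                  rmm::cuda_stream_view stream)
{
  // Indices come in {begin, end} pairs: rows [0, 2) and rows [2, col.size()).
  // All work is enqueued on the caller-provided stream.
  return cudf::slice(col, {0, 2, 2, col.size()}, stream);
}
@endcode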
@@ -452,14 +483,19 @@ std::vector slice(column_view const& input, std::initializer_list slice(table_view const& input, host_span indices); +std::vector slice(table_view const& input, + host_span indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_slice - * @copydoc cudf::slice(table_view const&, host_span) + * @copydoc cudf::slice(table_view const&, host_span, rmm::cuda_stream_view stream) */ -std::vector slice(table_view const& input, std::initializer_list indices); +std::vector slice(table_view const& input, + std::initializer_list indices, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Splits a `column_view` into a set of `column_view`s according to a set of indices @@ -491,14 +527,19 @@ std::vector slice(table_view const& input, std::initializer_list split(column_view const& input, host_span splits); +std::vector split(column_view const& input, + host_span splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_split - * @copydoc cudf::split(column_view const&, host_span) + * @copydoc cudf::split(column_view const&, host_span, rmm::cuda_stream_view) */ -std::vector split(column_view const& input, std::initializer_list splits); +std::vector split(column_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Splits a `table_view` into a set of `table_view`s according to a set of indices @@ -532,14 +573,19 @@ std::vector split(column_view const& input, std::initializer_list split(table_view const& input, host_span splits); +std::vector split(table_view const& input, + host_span splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @ingroup copy_split - * @copydoc cudf::split(table_view const&, host_span) + * @copydoc cudf::split(table_view const&, host_span, rmm::cuda_stream_view) */ -std::vector split(table_view const& input, std::initializer_list splits); +std::vector split(table_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Returns a new column, where each element is selected from either @p lhs or @@ -552,11 +598,12 @@ std::vector split(table_view const& input, std::initializer_list copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -576,11 +624,12 @@ std::unique_ptr copy_if_else( * @throws cudf::logic_error if lhs and rhs are not of the same type * @throws cudf::logic_error if boolean mask is not of type bool * @throws cudf::logic_error if boolean mask is not of the same length as rhs - * @param[in] lhs left-hand scalar - * @param[in] rhs right-hand column_view - * @param[in] boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" + * @param lhs left-hand scalar + * @param rhs right-hand column_view + * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" * boolean for each element. Null element represents false. 
- * @param[in] mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns new column with the selected elements */ @@ -588,6 +637,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -600,11 +650,12 @@ std::unique_ptr copy_if_else( * @throws cudf::logic_error if lhs and rhs are not of the same type * @throws cudf::logic_error if boolean mask is not of type bool * @throws cudf::logic_error if boolean mask is not of the same length as lhs - * @param[in] lhs left-hand column_view - * @param[in] rhs right-hand scalar - * @param[in] boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" + * @param lhs left-hand column_view + * @param rhs right-hand scalar + * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" * boolean for each element. Null element represents false. - * @param[in] mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns new column with the selected elements */ @@ -612,6 +663,7 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -622,11 +674,12 @@ std::unique_ptr copy_if_else( * rule: `output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs : rhs` * * @throws cudf::logic_error if boolean mask is not of type bool - * @param[in] lhs left-hand scalar - * @param[in] rhs right-hand scalar - * @param[in] boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" + * @param lhs left-hand scalar + * @param rhs right-hand scalar + * @param boolean_mask column of `type_id::BOOL8` representing "left (true) / right (false)" * boolean for each element. null element represents false. 
- * @param[in] mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns new column with the selected elements */ @@ -634,6 +687,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -665,10 +719,11 @@ std::unique_ptr copy_if_else( * @throws cudf::logic_error if boolean_mask.size() != target.num_rows() * @throws cudf::logic_error if number of `true` in `boolean_mask` > input.num_rows() * - * @param[in] input table_view (set of dense columns) to scatter - * @param[in] target table_view to modify with scattered values from `input` - * @param[in] boolean_mask column_view which acts as boolean mask - * @param[in] mr Device memory resource used to allocate device memory of the returned table + * @param input table_view (set of dense columns) to scatter + * @param target table_view to modify with scattered values from `input` + * @param boolean_mask column_view which acts as boolean mask + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned table * * @returns Returns a table by scattering `input` into `target` as per `boolean_mask` */ @@ -676,6 +731,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& input, table_view const& target, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -702,17 +758,19 @@ std::unique_ptr
boolean_mask_scatter( * @throws cudf::logic_error if boolean_mask.type() != bool * @throws cudf::logic_error if boolean_mask.size() != target.size() * - * @param[in] input scalars to scatter - * @param[in] target table_view to modify with scattered values from `input` - * @param[in] boolean_mask column_view which acts as boolean mask - * @param[in] mr Device memory resource used to allocate device memory of the returned table + * @param input scalars to scatter + * @param target table_view to modify with scattered values from `input` + * @param boolean_mask column_view which acts as boolean mask + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned table * * @returns Returns a table by scattering `input` into `target` as per `boolean_mask` */ std::unique_ptr
boolean_mask_scatter( - std::vector> const& input, + std::vector> const& input, table_view const& target, column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -725,12 +783,14 @@ std::unique_ptr
boolean_mask_scatter( * * @param input Column view to get the element from * @param index Index into `input` to get the element at + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Scalar containing the single value */ std::unique_ptr get_element( column_view const& input, size_type index, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -764,6 +824,7 @@ enum class sample_with_replacement : bool { * @param n non-negative number of samples expected from `input` * @param replacement Allow or disallow sampling of the same row more than once * @param seed Seed value to initiate random number generator + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @return Table containing samples from `input` @@ -773,6 +834,7 @@ std::unique_ptr
sample( size_type const n, sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -788,10 +850,12 @@ std::unique_ptr
sample( * * @param input The column which is (and whose descendants are) to be checked for * non-empty null rows. + * @param stream CUDA stream used for device memory operations and kernel launches * @return true If either the column or its descendants have non-empty null rows * @return false If neither the column or its descendants have non-empty null rows */ -bool has_nonempty_nulls(column_view const& input); +bool has_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Approximates if a column or its descendants *may* have non-empty null elements @@ -881,11 +945,13 @@ bool may_have_nonempty_nulls(column_view const& input); * @endcode * * @param input The column whose null rows are to be checked and purged + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A new column with equivalent contents to `input`, but with null rows purged */ std::unique_ptr purge_nonempty_nulls( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index fb04336871f..44736ca0762 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,7 @@ std::unique_ptr extract_day( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any datetime type and returns an int16_t + * @brief Extracts a weekday from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index 89bab94faaf..41eec156c47 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -36,7 +36,7 @@ struct pair_column_aggregation_equal_to { struct pair_column_aggregation_hash { size_t operator()(std::pair const& key) const { - return hash_combine(shallow_hash(key.first), key.second.do_hash()); + return cudf::hashing::detail::hash_combine(shallow_hash(key.first), key.second.do_hash()); } }; @@ -45,7 +45,7 @@ class result_cache { result_cache() = delete; ~result_cache() = default; result_cache(result_cache const&) = delete; - result_cache& operator=(const result_cache& other) = delete; + result_cache& operator=(result_cache const& other) = delete; result_cache(size_t num_columns) : _cache(num_columns) {} diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate_masks.hpp similarity index 76% rename from cpp/include/cudf/detail/concatenate.cuh rename to cpp/include/cudf/detail/concatenate_masks.hpp index 51bcb1afa1f..e7086ea17a5 100644 --- a/cpp/include/cudf/detail/concatenate.cuh +++ b/cpp/include/cudf/detail/concatenate_masks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,14 +17,11 @@ #include #include -#include -#include -#include #include #include - -#include +#include +#include namespace cudf { //! Inner interfaces and implementations @@ -39,12 +36,13 @@ namespace detail { * @param dest_mask The output buffer to copy null masks into * @param output_size The total number of null masks bits that are being copied * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of nulls */ -void concatenate_masks(device_span d_views, - device_span d_offsets, - bitmask_type* dest_mask, - size_type output_size, - rmm::cuda_stream_view stream); +size_type concatenate_masks(device_span d_views, + device_span d_offsets, + bitmask_type* dest_mask, + size_type output_size, + rmm::cuda_stream_view stream); /** * @brief Concatenates `views[i]`'s bitmask from the bits @@ -54,10 +52,11 @@ void concatenate_masks(device_span d_views, * @param views Column views whose bitmasks will be concatenated * @param dest_mask The output buffer to copy null masks into * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of nulls */ -void concatenate_masks(host_span views, - bitmask_type* dest_mask, - rmm::cuda_stream_view stream); +size_type concatenate_masks(host_span views, + bitmask_type* dest_mask, + rmm::cuda_stream_view stream); /** * @copydoc cudf::concatenate_masks(host_span, rmm::mr::device_memory_resource*) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index cb3e20b36f2..1dd91dcd865 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +43,8 @@ #include +#include + #include namespace cudf { @@ -126,7 +127,7 @@ __launch_bounds__(block_size) __global__ cudf::size_type tmp_block_sum = 0; // get output location using a scan of the mask result - const cudf::size_type local_index = block_scan_mask(mask_true, tmp_block_sum); + cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); block_sum += tmp_block_sum; if (has_validity) { @@ -141,7 +142,7 @@ __launch_bounds__(block_size) __global__ // scatter validity mask to shared memory if (has_validity and input_view.is_valid(tid)) { // determine aligned offset for this warp's output - const cudf::size_type aligned_offset = block_offset % cudf::detail::warp_size; + cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; temp_valids[local_index + aligned_offset] = true; } } @@ -161,10 +162,10 @@ __launch_bounds__(block_size) __global__ constexpr int num_warps = block_size / cudf::detail::warp_size; // account for partial blocks with non-warp-aligned offsets - const int last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - const int last_warp = min(num_warps, last_index / cudf::detail::warp_size); - const int wid = threadIdx.x / cudf::detail::warp_size; - const int lane = threadIdx.x % cudf::detail::warp_size; + int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; + int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); + int const wid = threadIdx.x / cudf::detail::warp_size; + int const lane = threadIdx.x % cudf::detail::warp_size; cudf::size_type tmp_warp_valid_counts{0}; @@ -181,7 +182,9 @@ 
__launch_bounds__(block_size) __global__ if (wid > 0 && wid < last_warp) output_valid[valid_index] = valid_warp; else { - atomicOr(&output_valid[valid_index], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } @@ -190,7 +193,9 @@ __launch_bounds__(block_size) __global__ uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); if (lane == 0 && valid_warp != 0) { tmp_warp_valid_counts += __popc(valid_warp); - atomicOr(&output_valid[valid_index + num_warps], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index + num_warps]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } } @@ -206,7 +211,8 @@ __launch_bounds__(block_size) __global__ cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); if (threadIdx.x == 0) { // one thread computes and adds to null count - atomicAdd(output_null_count, block_sum - block_valid_count); + cuda::atomic_ref ref{*output_null_count}; + ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); } } diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 083b12edbf8..04ad1f20196 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,9 +44,9 @@ __launch_bounds__(block_size) __global__ mutable_column_device_view out, size_type* __restrict__ const valid_count) { - const size_type tid = threadIdx.x + blockIdx.x * block_size; - const int warp_id = tid / warp_size; - const size_type warps_per_grid = gridDim.x * block_size / warp_size; + size_type const tid = threadIdx.x + blockIdx.x * block_size; + int const warp_id = tid / warp_size; + size_type const warps_per_grid = gridDim.x * block_size / warp_size; // begin/end indices for the column data size_type begin = 0; @@ -59,7 +59,7 @@ __launch_bounds__(block_size) __global__ // lane id within the current warp constexpr size_type leader_lane{0}; - const int lane_id = threadIdx.x % warp_size; + int const lane_id = threadIdx.x % warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 0d5aa509e08..16e4e7a1297 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -54,17 +54,17 @@ __global__ void copy_range_kernel(SourceValueIterator source_value_begin, "copy_range_kernel assumes bitmask element size in bits == warp size"); constexpr cudf::size_type leader_lane{0}; - const int lane_id = threadIdx.x % warp_size; + int const lane_id = threadIdx.x % warp_size; - const cudf::size_type tid = threadIdx.x + blockIdx.x * blockDim.x; - const int warp_id = tid / warp_size; + cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; + int const warp_id = tid / warp_size; - const cudf::size_type offset = target.offset(); - const cudf::size_type begin_mask_idx = cudf::word_index(offset + target_begin); - const cudf::size_type end_mask_idx = cudf::word_index(offset + target_end); + cudf::size_type const offset = target.offset(); + cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin); + cudf::size_type const end_mask_idx = cudf::word_index(offset + target_end); cudf::size_type mask_idx = begin_mask_idx + warp_id; - const cudf::size_type masks_per_grid = gridDim.x * blockDim.x / warp_size; + cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size; cudf::size_type target_offset = begin_mask_idx * 
warp_size - (offset + target_begin); cudf::size_type source_idx = tid + target_offset; @@ -79,10 +79,10 @@ __global__ void copy_range_kernel(SourceValueIterator source_value_begin, if (in_range) target.element(index) = *(source_value_begin + source_idx); if (has_validity) { // update bitmask - const bool valid = in_range && *(source_validity_begin + source_idx); - const int active_mask = __ballot_sync(0xFFFF'FFFFu, in_range); - const int valid_mask = __ballot_sync(0xFFFF'FFFFu, valid); - const int warp_mask = active_mask & valid_mask; + bool const valid = in_range && *(source_validity_begin + source_idx); + int const active_mask = __ballot_sync(0xFFFF'FFFFu, in_range); + int const valid_mask = __ballot_sync(0xFFFF'FFFFu, valid); + int const warp_mask = active_mask & valid_mask; cudf::bitmask_type old_mask = target.get_mask_word(mask_idx); if (lane_id == leader_lane) { @@ -154,11 +154,6 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - // TODO: if null_count is UNKNOWN_NULL_COUNT, no need to update null - // count (if null_count is UNKNOWN_NULL_COUNT, invoking null_count() - // will scan the entire bitmask array, and this can be surprising - // in performance if the copy range is small and the column size is - // large). rmm::device_scalar null_count(target.null_count(), stream); auto kernel = diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 3146005ca49..0ab9da0dbd0 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -582,7 +582,7 @@ struct indexalator_factory { }; /** - * @brief An index accessor that returns an index value if corresponding validity flag is true. + * @brief An index accessor that returns an index value if the scalar's validity flag is true. * * This is suitable as an `optional_iterator`. */ @@ -605,7 +605,7 @@ struct indexalator_factory { }; /** - * @brief Create an index iterator with a nullable index accessor. + * @brief Create an index iterator with an optional index accessor. */ static auto make_input_optional_iterator(column_view const& col) { @@ -613,7 +613,7 @@ struct indexalator_factory { } /** - * @brief Create an index iterator with a nullable index accessor for a scalar. + * @brief Create an index iterator with an optional index accessor for a scalar. */ static auto make_input_optional_iterator(scalar const& input) { diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index c424c20d7c7..6fcf10aef57 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include #include @@ -86,7 +86,6 @@ struct hash_join { * @brief Constructor that internally builds the hash table based on the given `build` table. * * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. * * @param build The build table, from which the hash table is built. * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or @@ -177,7 +176,6 @@ struct hash_join { * @copydoc cudf::detail::hash_join::probe_join_indices * * @throw cudf::logic_error if probe table is empty. - * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`. * @throw cudf::logic_error if the number of columns in build table and probe table do not match. 
* @throw cudf::logic_error if the column data types in build table and probe table do not match. */ diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 3ff3bb4cf3c..78cd3d7bcb7 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -66,9 +66,11 @@ __global__ void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - constexpr auto const word_size{detail::size_in_bits()}; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const last_bit_index = source_size_bits - 1; + auto const last_word_index = cudf::word_index(last_bit_index); + size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); @@ -86,20 +88,19 @@ __global__ void offset_bitmask_binop(Binop op, source_begin_bits[i] + source_size_bits)); } + if (destination_word_index == last_word_index) { + // mask out any bits not part of this word + auto const num_bits_in_last_word = intra_word_index(last_bit_index); + if (num_bits_in_last_word < + static_cast(detail::size_in_bits() - 1)) { + destination_word &= set_least_significant_bits(num_bits_in_last_word + 1); + } + } + destination[destination_word_index] = destination_word; thread_count += __popc(destination_word); } - // Subtract any slack bits from the last word - if (tid == 0) { - size_type const last_bit_index = source_size_bits - 1; - size_type const num_slack_bits = word_size - (last_bit_index % word_size) - 1; - if (num_slack_bits > 0) { - size_type const word_index = cudf::word_index(last_bit_index); - thread_count -= __popc(destination[word_index] & set_most_significant_bits(num_slack_bits)); - } - } - using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; size_type block_count = BlockReduce(temp_storage).Sum(thread_count); @@ -262,7 +263,7 @@ __global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bi */ struct bit_to_word_index { bit_to_word_index(bool inclusive) : inclusive(inclusive) {} - __device__ inline size_type operator()(const size_type& bit_index) const + __device__ inline size_type operator()(size_type const& bit_index) const { return word_index(bit_index) + ((inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); } @@ -378,13 +379,13 @@ size_type validate_segmented_indices(IndexIterator indices_begin, IndexIterator } struct index_alternator { - __device__ inline size_type operator()(const size_type& i) const + __device__ inline size_type operator()(size_type const& i) const { return *(d_indices + 2 * i + (is_end ? 1 : 0)); } bool const is_end = false; - const size_type* d_indices; + size_type const* d_indices; }; /** diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 7f1b15893c5..8c10bbe416f 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -141,20 +141,9 @@ cudf::size_type valid_count(bitmask_type const* bitmask, rmm::cuda_stream_view stream); /** - * @brief Given a validity bitmask, counts the number of null elements (unset bits) - * in the range `[start, stop)`. - * - * If `bitmask == nullptr`, all elements are assumed to be valid and the - * function returns ``. - * - * @throws cudf::logic_error if `start > stop` - * @throws cudf::logic_error if `start < 0` + * @copydoc null_count(bitmask_type const* bitmask, size_type start, size_type stop) * - * @param[in] bitmask Validity bitmask residing in device memory. 
- * @param[in] start Index of the first bit to count (inclusive). - * @param[in] stop Index of the last bit to count (exclusive). - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return The number of null elements in the specified range. + * @param stream Stream view on which to allocate resources and queue execution. */ cudf::size_type null_count(bitmask_type const* bitmask, size_type start, diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index 8d7323cb88e..4b840724034 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -1167,7 +1167,7 @@ class registered_message { * Registers `msg` with NVTX and associates a handle with the registered * message. * - * A particular message should should only be registered once and the handle + * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message @@ -1183,7 +1183,7 @@ class registered_message { * Registers `msg` with NVTX and associates a handle with the registered * message. * - * A particular message should should only be registered once and the handle + * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message @@ -1196,7 +1196,7 @@ class registered_message { * Registers `msg` with NVTX and associates a handle with the registered * message. * - * A particular message should should only be registered once and the handle + * A particular message should only be registered once and the handle * reused throughout the rest of the application. * * @param msg The contents of the message diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 39ae4fe1944..94c795f31b2 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -106,7 +106,7 @@ std::unique_ptr
scatter(table_view const& source, * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ -std::unique_ptr
scatter(std::vector> const& source, +std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, rmm::cuda_stream_view stream, @@ -136,7 +136,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& source, * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
boolean_mask_scatter( - std::vector> const& source, + std::vector> const& source, table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 0017ddb305d..155b1ce5691 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -244,7 +244,7 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_i * auto const bytes = cudf::detail::sizes_to_offsets( * d_offsets, d_offsets + strings_count + 1, d_offsets, stream); * CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - * "Size of output exceeds column size limit", std::overflow_error); + * "Size of output exceeds the column size limit", std::overflow_error); * @endcode * * @tparam SizesIterator Iterator type for input of the scan using addition operation @@ -303,9 +303,9 @@ std::pair, size_type> make_offsets_child_column( { auto count = static_cast(std::distance(begin, end)); auto offsets_column = make_numeric_column( - data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, count + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); + auto d_offsets = offsets_view.template data(); // The number of offsets is count+1 so to build the offsets from the sizes // using exclusive-scan technically requires count+1 input values even though @@ -319,7 +319,7 @@ std::pair, size_type> make_offsets_child_column( auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + "Size of output exceeds the column size limit", std::overflow_error); offsets_column->set_null_count(0); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e0fc7b71cd9..5476000fc29 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -86,24 +86,9 @@ std::unique_ptr
distinct(table_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Create a new table without duplicate rows. + * @copydoc cudf::stable_distinct * - * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. The row order is guaranteed to be preserved as in the input. - * - * If there are duplicate rows, which row to be copied depends on the specified value of the `keep` - * parameter. - * - * This API produces exactly the same set of output rows as `cudf::distinct`. - * - * @param input The input table - * @param keys Vector of indices indicating key columns in the `input` table - * @param keep Copy any, first, last, or none of the found duplicates - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN elements should be considered as equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table - * @return A table containing the resulting distinct rows + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index d9fb0efed45..b529d4a2c53 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -27,9 +27,10 @@ namespace tdigest { namespace detail { /** - * @brief Generate a tdigest column from a grouped set of numeric input values. + * @brief Generate a tdigest column from a grouped, sorted set of numeric input values. * - * The tdigest column produced is of the following structure: + * The input is expected to be sorted in ascending order within each group, with + * nulls at the end. * * struct { * // centroids for the digest @@ -166,96 +167,6 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief Generate a tdigest column from a grouped, sorted set of numeric input values. - * - * The input is expected to be sorted in ascending order within each group, with - * nulls at the end. - * - * The tdigest column produced is of the following structure: - ** struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). - * - * @param values Grouped (and sorted) values to merge. - * @param group_offsets Offsets of groups' starting points within @p values. - * @param group_labels 0-based ID of group that the corresponding value belongs to - * @param group_valid_counts Per-group counts of valid elements. - * @param num_groups Number of groups. - * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher - * values result in a larger, more precise tdigest. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns tdigest column, with 1 tdigest per row - */ -std::unique_ptr group_tdigest(column_view const& values, - cudf::device_span group_offsets, - cudf::device_span group_labels, - cudf::device_span group_valid_counts, - size_type num_groups, - int max_centroids, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -/** - * @brief Merges tdigests within the same group to generate a new tdigest. - * - * The tdigest column produced is of the following structure: - * - * struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). - * - * @param values Grouped tdigests to merge. - * @param group_offsets Offsets of groups' starting points within @p values. 
- * @param group_labels 0-based ID of group that the corresponding value belongs to - * @param num_groups Number of groups. - * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher - * values result in a larger, more precise tdigest. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns tdigest column, with 1 tdigest per row - */ -std::unique_ptr group_merge_tdigest(column_view const& values, - cudf::device_span group_offsets, - cudf::device_span group_labels, - size_type num_groups, - int max_centroids, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Generate a tdigest scalar from a set of numeric input values. * diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 5b64f61f11a..215ad50aed6 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -41,8 +41,8 @@ std::unique_ptr transform(column_view const& input, * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr compute_column(table_view const table, - ast::operation const& expr, +std::unique_ptr compute_column(table_view const& table, + ast::expression const& expr, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index cdbc26701d1..264302df0e9 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,8 +41,8 @@ static constexpr size_type warp_size{32}; */ class grid_1d { public: - const int num_threads_per_block; - const int num_blocks; + int const num_threads_per_block; + int const num_blocks; /** * @param overall_num_elements The number of elements the kernel needs to * handle/process, in its main, one-dimensional/linear input (e.g. one or more @@ -65,6 +65,56 @@ class grid_1d { CUDF_EXPECTS(num_threads_per_block > 0, "num_threads_per_block must be > 0"); CUDF_EXPECTS(num_blocks > 0, "num_blocks must be > 0"); } + + /** + * @brief Returns the global thread index in a 1D grid. + * + * The returned index is unique across the entire grid. + * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + static constexpr thread_index_type global_thread_id(thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) + { + return thread_id + block_id * num_threads_per_block; + } + + /** + * @brief Returns the global thread index of the current thread in a 1D grid. + * + * @return thread_index_type The global thread index + */ + static __device__ thread_index_type global_thread_id() + { + return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x); + } + + /** + * @brief Returns the stride of a 1D grid. + * + * The returned stride is the total number of threads in the grid. 
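A minimal sketch of how `global_thread_id()` and `grid_stride()` are meant to be used together in a grid-stride loop (the kernel below is illustrative, not part of this patch):

@code{.cpp}
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/types.hpp>

__global__ void increment_kernel(int* data, cudf::size_type size)
{
  // Each thread starts at its global index and advances by the total number
  // of threads in the grid until all elements have been visited.
  auto const stride = cudf::detail::grid_1d::grid_stride();
  for (auto idx = cudf::detail::grid_1d::global_thread_id(); idx < size; idx += stride) {
    data[idx] += 1;
  }
}
@endcode

A launch would typically size the grid with `cudf::detail::grid_1d{size, 256}` and read back its `num_blocks` and `num_threads_per_block` members.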
+ * + * @param thread_id The thread index within the block + * @param block_id The block index within the grid + * @param num_threads_per_block The number of threads per block + * @return thread_index_type The global thread index + */ + static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) + { + return num_threads_per_block * num_blocks_per_grid; + } + + /** + * @brief Returns the stride of the current 1D grid. + * + * @return thread_index_type The number of threads in the grid. + */ + static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); } }; /** @@ -106,6 +156,10 @@ __device__ T single_lane_block_sum_reduce(T lane_value) lane_value = (lane_id < warps_per_block) ? lane_values[lane_id] : T{0}; result = cub::WarpReduce(temp).Sum(lane_value); } + // Shared memory has block scope, so sync here to ensure no data + // races between successive calls to this function in the same + // kernel. + __syncthreads(); return result; } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index c1fc96d6f43..c56e88f07a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ * cudf::duration_us, cudf::duration_ns and bool * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, * `atomicCAS`. - * `atomicAnd`, `atomicOr`, `atomicXor` are also supported for integer data types. * Also provides `cudf::genericAtomicOperation` which performs atomic operation * with the given binary operator. */ @@ -161,7 +160,6 @@ struct genericAtomicOperationImpl { // specialized functions for operators // `atomicAdd` supports int32, float, double (signed int64 is not supported.) 
// `atomicMin`, `atomicMax` support int32_t, int64_t -// `atomicAnd`, `atomicOr`, `atomicXor` support int32_t, int64_t template <> struct genericAtomicOperationImpl { using T = float; @@ -252,63 +250,6 @@ struct genericAtomicOperationImpl { return ret; } }; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - return atomicAnd(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - return atomicOr(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - return atomicXor(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -598,66 +539,3 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } - -/** - * @brief Overloads for `atomicAnd` - * reads the `old` located at the `address` in global or shared memory, - * computes (old & val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicAnd` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceAnd{}); -} - -/** - * @brief Overloads for `atomicOr` - * reads the `old` located at the `address` in global or shared memory, - * computes (old | val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicOr` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. 
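With the cudf-provided atomicAnd/atomicOr/atomicXor overloads removed here, device code that needs bitwise atomics on natively supported widths can fall back to the CUDA built-ins directly. A minimal illustrative sketch, not part of this change:

#include <cudf/types.hpp>

// Illustrative only: CUDA's native atomicOr covers unsigned 32-bit (and 64-bit) words,
// which is the common case for bitmask updates.
__global__ void set_bits(unsigned int* bitmask, cudf::size_type word_index, unsigned int bits)
{
  atomicOr(bitmask + word_index, bits);
}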
- * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceOr{}); -} - -/** - * @brief Overloads for `atomicXor` - * reads the `old` located at the `address` in global or shared memory, - * computes (old ^ val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicXor` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceXor{}); -} diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index f6d9d32b398..46f424e051b 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) */ struct DeviceSum { template ()>* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -93,13 +93,13 @@ struct DeviceSum { */ struct DeviceCount { template ()>* = nullptr> - CUDF_HOST_DEVICE inline T operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(T const& lhs, T const& rhs) { return T{DeviceCount{}(lhs.time_since_epoch(), rhs.time_since_epoch())}; } template ()>* = nullptr> - CUDF_HOST_DEVICE inline T operator()(const T&, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(T const&, T const& rhs) { return rhs + T{1}; } @@ -116,7 +116,7 @@ struct DeviceCount { */ struct DeviceMin { template - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(cudf::detail::min(lhs, rhs)) { return numeric::detail::min(lhs, rhs); @@ -164,7 +164,7 @@ struct DeviceMin { */ struct DeviceMax { template - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(cudf::detail::max(lhs, rhs)) { return numeric::detail::max(lhs, rhs); @@ -211,7 +211,7 @@ struct DeviceMax { */ struct DeviceProduct { template ()>* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) + CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -230,44 +230,11 @@ struct DeviceProduct { } }; -/** - * @brief binary `and` operator - */ -struct DeviceAnd { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs 
& rhs) - { - return (lhs & rhs); - } -}; - -/** - * @brief binary `or` operator - */ -struct DeviceOr { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) - { - return (lhs | rhs); - } -}; - -/** - * @brief binary `xor` operator - */ -struct DeviceXor { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) - { - return (lhs ^ rhs); - } -}; - /** * @brief Operator for calculating Lead/Lag window function. */ struct DeviceLeadLag { - const size_type row_offset; + size_type const row_offset; explicit CUDF_HOST_DEVICE inline DeviceLeadLag(size_type offset_) : row_offset(offset_) {} }; diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh deleted file mode 100644 index ca9c16043a3..00000000000 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace cudf { -namespace detail { - -/** - * Normalization of floating point NaNs, passthrough for all other values. - */ -template -T __device__ inline normalize_nans(T const& key) -{ - if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } - } - return key; -} - -/** - * Normalization of floating point NaNs and zeros, passthrough for all other values. 
- */ -template -T __device__ inline normalize_nans_and_zeros(T const& key) -{ - if constexpr (cudf::is_floating_point()) { - if (key == T{0.0}) { return T{0.0}; } - } - return normalize_nans(key); -} - -__device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r) -{ - // This function is equivalent to (x << r) | (x >> (32 - r)) - return __funnelshift_l(x, x, r); -} - -__device__ inline uint32_t rotate_bits_right(uint32_t x, uint32_t r) -{ - // This function is equivalent to (x >> r) | (x << (32 - r)) - return __funnelshift_r(x, x, r); -} - -__device__ inline uint64_t rotate_bits_right(uint64_t x, uint32_t r) -{ - return (x >> r) | (x << (64 - r)); -} - -// Swap the endianness of a 32 bit value -__device__ inline uint32_t swap_endian(uint32_t x) -{ - // The selector 0x0123 reverses the byte order - return __byte_perm(x, 0, 0x0123); -} - -// Swap the endianness of a 64 bit value -// There is no CUDA intrinsic for permuting bytes in 64 bit integers -__device__ inline uint64_t swap_endian(uint64_t x) -{ - // Reverse the endianness of each 32 bit section - uint32_t low_bits = swap_endian(static_cast(x)); - uint32_t high_bits = swap_endian(static_cast(x >> 32)); - // Reassemble a 64 bit result, swapping the low bits and high bits - return (static_cast(low_bits) << 32) | (static_cast(high_bits)); -}; - -template -struct hash_circular_buffer { - uint8_t storage[capacity]; - uint8_t* cur; - int available_space{capacity}; - hash_step_callable hash_step; - - __device__ inline hash_circular_buffer(hash_step_callable hash_step) - : cur{storage}, hash_step{hash_step} - { - } - - __device__ inline void put(uint8_t const* in, int size) - { - int copy_start = 0; - while (size >= available_space) { - // The buffer will be filled by this chunk of data. Copy a chunk of the - // data to fill the buffer and trigger a hash step. - memcpy(cur, in + copy_start, available_space); - hash_step(storage); - size -= available_space; - copy_start += available_space; - cur = storage; - available_space = capacity; - } - // The buffer will not be filled by the remaining data. That is, `size >= 0 - // && size < capacity`. We copy the remaining data into the buffer but do - // not trigger a hash step. - memcpy(cur, in + copy_start, size); - cur += size; - available_space -= size; - } - - __device__ inline void pad(int const space_to_leave) - { - if (space_to_leave > available_space) { - memset(cur, 0x00, available_space); - hash_step(storage); - cur = storage; - available_space = capacity; - } - memset(cur, 0x00, available_space - space_to_leave); - cur += available_space - space_to_leave; - available_space = space_to_leave; - } - - __device__ inline const uint8_t& operator[](int idx) const { return storage[idx]; } -}; - -// Get a uint8_t pointer to a column element and its size as a pair. -template -auto __device__ inline get_element_pointer_and_size(Element const& element) -{ - if constexpr (is_fixed_width() && !is_chrono()) { - return thrust::make_pair(reinterpret_cast(&element), sizeof(Element)); - } else { - CUDF_UNREACHABLE("Unsupported type."); - } -} - -template <> -auto __device__ inline get_element_pointer_and_size(string_view const& element) -{ - return thrust::make_pair(reinterpret_cast(element.data()), element.size_bytes()); -} - -/** - * Modified GPU implementation of - * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ - * Copyright (c) 2015 Barry Clark - * Licensed under the MIT license. 
- * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ -void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) -{ - // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 - uint64_t x = num; - x = ((x & 0xFFFF'0000u) << 16) | ((x & 0xFFFF)); - x = ((x & 0x000F'0000'000Fu) << 8) | ((x & 0x00F0'0000'00F0u) >> 4) | - ((x & 0x0F00'0000'0F00u) << 16) | ((x & 0xF000'0000'F000) << 4); - - // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits - uint64_t offsets = (((x + 0x0606'0606'0606'0606) >> 4) & 0x0101'0101'0101'0101) * 0x27; - - x |= 0x3030'3030'3030'3030; - x += offsets; - std::memcpy(destination, reinterpret_cast(&x), 8); -} - -// MurmurHash3_32 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. -template -struct MurmurHash3_32 { - using result_type = hash_value_type; - - constexpr MurmurHash3_32() = default; - constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - - [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto const block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - [[nodiscard]] result_type __device__ inline operator()(Key const& key) const - { - return compute(detail::normalize_nans_and_zeros(key)); - } - - template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a four-byte chunk. - uint32_t k1 = 0; - switch (len % 4) { - case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; [[fallthrough]]; - case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; [[fallthrough]]; - case 1: - k1 ^= std::to_integer(data[tail_offset]); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const - { - constexpr cudf::size_type BLOCK_SIZE = 4; - cudf::size_type const nblocks = len / BLOCK_SIZE; - cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h = m_seed; - - // Process all four-byte chunks. 
- for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(data, i * BLOCK_SIZE); - k1 *= c1; - k1 = cudf::detail::rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = cudf::detail::rotate_bits_left(h, rot_c2); - h = h * 5 + c3; - } - - h = compute_remaining_bytes(data, len, tail_offset, h); - - // Finalize hash. - h ^= len; - h = fmix32(h); - return h; - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; - static constexpr uint32_t c1 = 0xcc9e2d51; - static constexpr uint32_t c2 = 0x1b873593; - static constexpr uint32_t c3 = 0xe6546b64; - static constexpr uint32_t rot_c1 = 15; - static constexpr uint32_t rot_c2 = 13; -}; - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& key) const -{ - return compute(static_cast(key)); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const -{ - return compute(detail::normalize_nans_and_zeros(key)); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const -{ - return compute(detail::normalize_nans_and_zeros(key)); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - numeric::decimal64 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - numeric::decimal128 const& key) const -{ - return compute(key.value()); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::list_view const& key) const -{ - CUDF_UNREACHABLE("List column hashing is not supported"); -} - -template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::struct_view const& key) const -{ - CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); -} - -/** - * @brief This hash function simply returns the value that is asked to be hash - * reinterpreted as the result_type of the functor. - */ -template -struct IdentityHash { - using result_type = hash_value_type; - IdentityHash() = default; - constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} - - template - constexpr std::enable_if_t, return_type> operator()( - Key const& key) const - { - CUDF_UNREACHABLE("IdentityHash does not support this data type"); - } - - template - constexpr std::enable_if_t, return_type> operator()( - Key const& key) const - { - return static_cast(key); - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; -}; - -template -using default_hash = MurmurHash3_32; - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/int_fastdiv.h b/cpp/include/cudf/detail/utilities/int_fastdiv.h index b56fe0e88c1..ff442af5194 100644 --- a/cpp/include/cudf/detail/utilities/int_fastdiv.h +++ b/cpp/include/cudf/detail/utilities/int_fastdiv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Copyright 2014 Maxim Milakov * @@ -58,7 +58,7 @@ class int_fastdiv { int p; unsigned int ad, anc, delta, q1, r1, q2, r2, t; - const unsigned two31 = 0x8000'0000u; + unsigned const two31 = 0x8000'0000u; ad = (d == 0) ? 1 : abs(d); t = two31 + ((unsigned int)d >> 31); anc = t - 1 - t % ad; @@ -95,11 +95,11 @@ class int_fastdiv { n_add_sign = 0; } - __host__ __device__ __forceinline__ friend int operator/(const int divident, - const int_fastdiv& divisor); + __host__ __device__ __forceinline__ friend int operator/(int const divident, + int_fastdiv const& divisor); }; -__host__ __device__ __forceinline__ int operator/(const int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(int const n, int_fastdiv const& divisor) { int q; #ifdef __CUDA_ARCH__ @@ -115,61 +115,61 @@ __host__ __device__ __forceinline__ int operator/(const int n, const int_fastdiv return q; } -__host__ __device__ __forceinline__ int operator%(const int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(int const n, int_fastdiv const& divisor) { int quotient = n / divisor; int remainder = n - quotient * divisor; return remainder; } -__host__ __device__ __forceinline__ int operator/(const unsigned int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(unsigned int const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const unsigned int n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(unsigned int const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const short n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(short const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const short n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(short const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const unsigned short n, - const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(unsigned short const n, + int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const unsigned short n, - const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(unsigned short const n, + int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(char const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(char const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } -__host__ __device__ __forceinline__ int operator/(const unsigned char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator/(unsigned char const n, int_fastdiv const& divisor) { return ((int)n) / divisor; } -__host__ __device__ __forceinline__ int operator%(const unsigned char n, const int_fastdiv& divisor) +__host__ __device__ __forceinline__ int operator%(unsigned char const n, int_fastdiv const& divisor) { return ((int)n) % divisor; } diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp 
b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 40faae7e9f4..8b709f2a8f8 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -S round_up_safe(S number_to_round, S modulus) +constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -67,7 +67,7 @@ S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. */ template -S round_down_safe(S number_to_round, S modulus) noexcept +constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; @@ -107,7 +107,7 @@ constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept * the result will be incorrect */ template -constexpr S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept { return (dividend + divisor - 1) / divisor; } diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 83f061e9407..9e2b85ea129 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -50,7 +50,7 @@ class pinned_allocator { public: using value_type = void; ///< The type of the elements in the allocator using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = const void*; ///< The type returned by address() + using const_pointer = void const*; ///< The type returned by address() using size_type = std::size_t; ///< The type used for the size of the allocation using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers @@ -76,9 +76,9 @@ class pinned_allocator { public: using value_type = T; ///< The type of the elements in the allocator using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = const T*; ///< The type returned by address() + using const_pointer = T const*; ///< The type returned by address() using reference = T&; ///< The parameter type for address() - using const_reference = const T&; ///< The parameter type for address() + using const_reference = T const&; ///< The parameter type for address() using size_type = std::size_t; ///< The type used for the size of the allocation using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers diff --git a/cpp/include/cudf/detail/utilities/stacktrace.hpp b/cpp/include/cudf/detail/utilities/stacktrace.hpp new file mode 100644 index 00000000000..c3ec9ce7a52 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/stacktrace.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::detail { +/** + * @addtogroup utility_stacktrace + * @{ + * @file + */ + +/** + * @brief Specify whether the last stackframe is included in the stacktrace. + */ +enum class capture_last_stackframe : bool { YES, NO }; + +/** + * @brief Query the current stacktrace and return the whole stacktrace as one string. + * + * Depending on the value of the flag `capture_last_frame`, the caller that executes stacktrace + * retrieval can be included in the output result. + * + * @param capture_last_frame Flag to specify if the current stackframe will be included into + * the output + * @return A string storing the whole current stacktrace + */ +std::string get_stacktrace(capture_last_stackframe capture_last_frame); + +/** @} */ // end of group + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index c446a7b5148..90ad98741ad 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -398,7 +398,7 @@ thrust::host_vector make_host_vector_async( } /** - * @brief Synchronously construct a `std::vector` containing a copy of data from a + * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * * @note This function does a synchronize on `stream`. @@ -417,7 +417,7 @@ thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_s } /** - * @brief Synchronously construct a `std::vector` containing a copy of data from a device + * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * * @note This function synchronizes `stream`. diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 76d6fd719a4..f3f95dad017 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -49,8 +49,8 @@ __global__ void valid_if_kernel( { constexpr size_type leader_lane{0}; auto const lane_id{threadIdx.x % warp_size}; - thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto i = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count{0}; auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size); @@ -119,7 +119,7 @@ std::pair valid_if(InputIterator begin, * Given a set of bitmasks, `masks`, the state of bit `j` in mask `i` is * determined by `p( *(begin1 + i), *(begin2 + j))`. If the predicate evaluates - * to true, the the bit is set to `1`. If false, set to `0`. + * to true, the bit is set to `1`. If false, set to `0`. * * Example Arguments: * begin1: zero-based counting iterator, diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 8688e97ab7e..1268f488919 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -54,11 +55,13 @@ namespace cudf { * @param begin The starting index of the fill range (inclusive) * @param end The index of the last element in the fill range (exclusive) * @param value The scalar value to fill + * @param stream CUDA stream used for device memory operations and kernel launches */ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, - scalar const& value); + scalar const& value, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Fills a range of elements in a column out-of-place with a scalar @@ -79,6 +82,7 @@ void fill_in_place(mutable_column_view& destination, * @param begin The starting index of the fill range (inclusive) * @param end The index of the last element in the fill range (exclusive) * @param value The scalar value to fill + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result output column */ @@ -87,6 +91,7 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -113,12 +118,14 @@ std::unique_ptr fill( * * @param input_table Input table * @param count Non-nullable column of an integral type + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,19 +138,19 @@ std::unique_ptr
repeat( * count = 2 * return = [4,4,5,5,6,6] * ``` - * @throws cudf::logic_error if the data type of @p count is not size_type. - * @throws cudf::logic_error if @p count is invalid or @p count is negative. - * @throws cudf::logic_error if @p input_table.num_rows() * @p count overflows - * size_type. + * @throws cudf::logic_error if @p count is negative. + * @throws std::overflow_error if @p input_table.num_rows() * @p count overflows size_type. * * @param input_table Input table * @param count Number of repetitions + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -166,6 +173,7 @@ std::unique_ptr
repeat( * @param size Size of the output column * @param init First value in the sequence * @param step Increment value + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result column containing the generated sequence */ @@ -173,6 +181,7 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -192,12 +201,14 @@ std::unique_ptr sequence( * * @param size Size of the output column * @param init First value in the sequence + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, scalar const& init, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -219,6 +230,7 @@ std::unique_ptr sequence( * @param size Number of timestamps to generate * @param init The initial timestamp * @param months Months to increment + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return Timestamps column with sequences of months @@ -227,6 +239,7 @@ std::unique_ptr calendrical_month_sequence( size_type size, scalar const& init, size_type months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index b178700cfc3..7c59c2f9194 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -20,11 +20,8 @@ #include #include -// Note: The versions are used in order for Jitify to work with our fixed_point type. -// Jitify is needed for several algorithms (binaryop, rolling, etc) -#include #include -#include // add cuda namespace +#include #include #include diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 8492916bb3c..1de7f66127b 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,12 @@ */ #pragma once +// To avoid https://github.com/NVIDIA/libcudacxx/issues/460 +// in libcudacxx with CTK 12.0/12.1 +#include #include -// Note: The versions are used in order for Jitify to work with our fixed_point type. 
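The fill/repeat/sequence overloads above now take an explicit rmm::cuda_stream_view ahead of the memory resource. A hedged sketch of calling the new signature; the scalar types, sizes, and function name are illustrative:

#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <rmm/cuda_stream_view.hpp>

// Illustrative only: generate 0, 2, 4, ... on an explicitly passed (here: default) stream.
auto make_even_sequence()
{
  cudf::numeric_scalar<int32_t> init(0);
  cudf::numeric_scalar<int32_t> step(2);
  return cudf::sequence(1000, init, step, cudf::get_default_stream());
}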
-// Jitify is needed for several algorithms (binaryop, rolling, etc) -#include #include #include diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 2f5c0d53e72..6e575685daa 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -186,6 +186,15 @@ class groupby { host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @copydoc aggregate(host_span, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ + std::pair, std::vector> aggregate( + host_span requests, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs grouped scans on the specified values. * @@ -294,7 +303,7 @@ class groupby { std::pair, std::unique_ptr
> shift( table_view const& values, host_span offsets, - std::vector> const& fill_values, + std::vector> const& fill_values, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index a8f4f271309..72e32715ed4 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,6 @@ namespace cudf { -using hash_value_type = uint32_t; ///< Type of hash value - /** * @addtogroup column_hash * @{ @@ -31,7 +29,14 @@ using hash_value_type = uint32_t; ///< Type of hash value */ /** - * @brief Identifies the hash function to be used + * @brief Type of hash value + * + */ +using hash_value_type = uint32_t; + +/** + * @brief Identifies the hash function to be used + * */ enum class hash_id { HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed @@ -48,9 +53,12 @@ static constexpr uint32_t DEFAULT_HASH_SEED = 0; /** * @brief Computes the hash value of each row in the input set of columns. * + * @deprecated Since 23.08 + * * @param input The table of columns to hash * @param hash_function The hash function enum to use * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A column where each row is the hash of a column from the input @@ -59,7 +67,103 @@ std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +//! Hash APIs +namespace hashing { + +/** + * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table + * + * This function computes the hash of each column using the `seed` for the first column + * and the resulting hash as a seed for the next column and so on. + * The result is a uint32 value for each row. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr murmurhash3_x86_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the MurmurHash3 64-bit hash value of each row in the given table + * + * This function takes a 64-bit seed value and returns hash values using the + * MurmurHash3_x64_128 algorithm. The hash produces in two uint64 values per row. 
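These declarations introduce the public cudf::hashing namespace. A minimal sketch of how a caller might use it once this change lands; the wrapper function name is illustrative:

#include <cudf/column/column.hpp>
#include <cudf/hashing.hpp>
#include <cudf/table/table_view.hpp>

// Illustrative only: one 32-bit hash value per row, default seed and stream.
std::unique_ptr<cudf::column> hash_rows(cudf::table_view const& input)
{
  return cudf::hashing::murmurhash3_x86_32(input);
}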
+ * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A table of two UINT64 columns + */ +std::unique_ptr
murmurhash3_x64_128( + table_view const& input, + uint64_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table + * + * This function computes the hash similar to MurmurHash3_x86_32 with special processing + * to match Spark's implementation results. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr spark_murmurhash3_x86_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the MD5 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr md5( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the XXHash_64 hash value of each row in the given table + * + * This function takes a 64-bit seed value and returns a column of type UINT64. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr xxhash_64( + table_view const& input, + uint64_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace hashing + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/hashing/detail/default_hash.cuh b/cpp/include/cudf/hashing/detail/default_hash.cuh new file mode 100644 index 00000000000..37e13d8842f --- /dev/null +++ b/cpp/include/cudf/hashing/detail/default_hash.cuh @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cudf::hashing::detail { + +/** + * @brief The default hash algorithm for use within libcudf internal functions + * + * This is declared here so it may be changed to another algorithm without modifying + * all those places that use it. Internal function implementations are encourage to + * use the `cudf::hashing::detail::default_hash` where possible. + * + * @tparam Key The key type for use by the hash class + */ +template +using default_hash = MurmurHash3_x86_32; + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh new file mode 100644 index 00000000000..7a3d1990791 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf::hashing::detail { + +/** + * Normalization of floating point NaNs, passthrough for all other values. + */ +template +T __device__ inline normalize_nans(T const& key) +{ + if constexpr (cudf::is_floating_point()) { + if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + } + return key; +} + +/** + * Normalization of floating point NaNs and zeros, passthrough for all other values. + */ +template +T __device__ inline normalize_nans_and_zeros(T const& key) +{ + if constexpr (cudf::is_floating_point()) { + if (key == T{0.0}) { return T{0.0}; } + } + return normalize_nans(key); +} + +__device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r) +{ + // This function is equivalent to (x << r) | (x >> (32 - r)) + return __funnelshift_l(x, x, r); +} + +__device__ inline uint64_t rotate_bits_left(uint64_t x, uint32_t r) +{ + return (x << r) | (x >> (64 - r)); +} + +__device__ inline uint32_t rotate_bits_right(uint32_t x, uint32_t r) +{ + // This function is equivalent to (x >> r) | (x << (32 - r)) + return __funnelshift_r(x, x, r); +} + +__device__ inline uint64_t rotate_bits_right(uint64_t x, uint32_t r) +{ + return (x >> r) | (x << (64 - r)); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp similarity index 62% rename from cpp/include/cudf/detail/hashing.hpp rename to cpp/include/cudf/hashing/detail/hashing.hpp index 771b3e150ec..f08d0fbb849 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -24,32 +24,32 @@ #include namespace cudf { +namespace hashing { namespace detail { -/** - * @copydoc cudf::hash - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr murmurhash3_x86_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr
murmurhash3_x64_128(table_view const& input, + uint64_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); -std::unique_ptr murmur_hash3_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); +std::unique_ptr spark_murmurhash3_x86_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); -std::unique_ptr spark_murmur_hash3_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); +std::unique_ptr md5(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); -std::unique_ptr md5_hash(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr xxhash_64(table_view const& input, + uint64_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); /* Copyright 2005-2014 Daniel James. * @@ -94,6 +94,7 @@ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) } } // namespace detail +} // namespace hashing } // namespace cudf // specialization of std::hash for cudf::data_type @@ -102,8 +103,8 @@ template <> struct hash { std::size_t operator()(cudf::data_type const& type) const noexcept { - return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), - std::hash{}(type.scale())); + return cudf::hashing::detail::hash_combine( + std::hash{}(static_cast(type.id())), std::hash{}(type.scale())); } }; } // namespace std diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh new file mode 100644 index 00000000000..c986a908706 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cudf::hashing::detail { + +// MurmurHash3_x64_128 implementation from +// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. +template +struct MurmurHash3_x64_128 { + using result_type = thrust::pair; + + constexpr MurmurHash3_x64_128() = default; + constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} + + __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + { + // Read a 4-byte value from the data pointer as individual bytes for safe + // unaligned access (very likely for string types). 
+ auto block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + } + + __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const + { + uint64_t result = getblock32(data, offset + 4); + result = result << 32; + return result | getblock32(data, offset); + } + + __device__ inline uint64_t fmix64(uint64_t k) const + { + k ^= k >> 33; + k *= 0xff51afd7ed558ccdUL; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53UL; + k ^= k >> 33; + return k; + } + + result_type __device__ inline operator()(Key const& key) const { return compute(key); } + + template + result_type __device__ inline compute(T const& key) const + { + return compute_bytes(reinterpret_cast(&key), sizeof(T)); + } + + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const + { + // Process remaining bytes that do not fill a 8-byte chunk. + uint64_t k1 = 0; + uint64_t k2 = 0; + auto const tail = reinterpret_cast(data) + tail_offset; + switch (len & (BLOCK_SIZE - 1)) { + case 15: k2 ^= static_cast(tail[14]) << 48; + case 14: k2 ^= static_cast(tail[13]) << 40; + case 13: k2 ^= static_cast(tail[12]) << 32; + case 12: k2 ^= static_cast(tail[11]) << 24; + case 11: k2 ^= static_cast(tail[10]) << 16; + case 10: k2 ^= static_cast(tail[9]) << 8; + case 9: + k2 ^= static_cast(tail[8]) << 0; + k2 *= c2; + k2 = rotate_bits_left(k2, 33); + k2 *= c1; + h.second ^= k2; + + case 8: k1 ^= static_cast(tail[7]) << 56; + case 7: k1 ^= static_cast(tail[6]) << 48; + case 6: k1 ^= static_cast(tail[5]) << 40; + case 5: k1 ^= static_cast(tail[4]) << 32; + case 4: k1 ^= static_cast(tail[3]) << 24; + case 3: k1 ^= static_cast(tail[2]) << 16; + case 2: k1 ^= static_cast(tail[1]) << 8; + case 1: + k1 ^= static_cast(tail[0]) << 0; + k1 *= c1; + k1 = rotate_bits_left(k1, 31); + k1 *= c2; + h.first ^= k1; + }; + return h; + } + + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + { + auto const nblocks = len / BLOCK_SIZE; + uint64_t h1 = m_seed; + uint64_t h2 = m_seed; + + // Process all four-byte chunks. + for (cudf::size_type i = 0; i < nblocks; i++) { + uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes + uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes + + k1 *= c1; + k1 = rotate_bits_left(k1, 31); + k1 *= c2; + + h1 ^= k1; + h1 = rotate_bits_left(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = rotate_bits_left(k2, 33); + k2 *= c1; + + h2 ^= k2; + h2 = rotate_bits_left(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); + + // Finalize hash. 
+ h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return {h1, h2}; + } + + private: + uint64_t m_seed{}; + static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes + + static constexpr uint64_t c1 = 0x87c37b91114253d5UL; + static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; +}; + +template <> +MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( + bool const& key) const +{ + return compute(key); +} + +template <> +MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( + float const& key) const +{ + return compute(normalize_nans(key)); +} + +template <> +MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( + double const& key) const +{ + return compute(normalize_nans(key)); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + cudf::string_view const& key) const +{ + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return compute_bytes(data, len); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + numeric::decimal32 const& key) const +{ + return compute(key.value()); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + numeric::decimal64 const& key) const +{ + return compute(key.value()); +} + +template <> +MurmurHash3_x64_128::result_type + __device__ inline MurmurHash3_x64_128::operator()( + numeric::decimal128 const& key) const +{ + return compute(key.value()); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh new file mode 100644 index 00000000000..6cf0b0fe817 --- /dev/null +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::hashing::detail { + +// MurmurHash3_x86_32 implementation from +// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
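The relocated MurmurHash3_x86_32 functor declared just below is a device-callable hasher keyed on a single value type. A rough sketch of direct use; the kernel and output layout are illustrative only:

#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>

#include <cstdint>

// Illustrative only: hash a single int32 key on the device with the default seed.
__global__ void hash_one_key(int32_t key, uint32_t* out)
{
  cudf::hashing::detail::MurmurHash3_x86_32<int32_t> hasher{};
  *out = hasher(key);
}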
+template <typename Key>
+struct MurmurHash3_x86_32 {
+  using result_type = hash_value_type;
+
+  constexpr MurmurHash3_x86_32() = default;
+  constexpr MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {}
+
+  [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const
+  {
+    h ^= h >> 16;
+    h *= 0x85ebca6b;
+    h ^= h >> 13;
+    h *= 0xc2b2ae35;
+    h ^= h >> 16;
+    return h;
+  }
+
+  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
+                                                      cudf::size_type offset) const
+  {
+    // Read a 4-byte value from the data pointer as individual bytes for safe
+    // unaligned access (very likely for string types).
+    auto const block = reinterpret_cast<uint8_t const*>(data + offset);
+    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
+  }
+
+  [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
+  {
+    return compute(normalize_nans_and_zeros(key));
+  }
+
+  template <typename T>
+  result_type __device__ inline compute(T const& key) const
+  {
+    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
+  }
+
+  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
+                                                        cudf::size_type len,
+                                                        cudf::size_type tail_offset,
+                                                        result_type h) const
+  {
+    // Process remaining bytes that do not fill a four-byte chunk.
+    uint32_t k1 = 0;
+    switch (len % 4) {
+      case 3: k1 ^= std::to_integer<uint32_t>(data[tail_offset + 2]) << 16; [[fallthrough]];
+      case 2: k1 ^= std::to_integer<uint32_t>(data[tail_offset + 1]) << 8; [[fallthrough]];
+      case 1:
+        k1 ^= std::to_integer<uint32_t>(data[tail_offset]);
+        k1 *= c1;
+        k1 = rotate_bits_left(k1, rot_c1);
+        k1 *= c2;
+        h ^= k1;
+    };
+    return h;
+  }
+
+  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
+  {
+    constexpr cudf::size_type BLOCK_SIZE = 4;
+    cudf::size_type const nblocks        = len / BLOCK_SIZE;
+    cudf::size_type const tail_offset    = nblocks * BLOCK_SIZE;
+    result_type h                        = m_seed;
+
+    // Process all four-byte chunks.
+    for (cudf::size_type i = 0; i < nblocks; i++) {
+      uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
+      k1 *= c1;
+      k1 = rotate_bits_left(k1, rot_c1);
+      k1 *= c2;
+      h ^= k1;
+      h = rotate_bits_left(h, rot_c2);
+      h = h * 5 + c3;
+    }
+
+    h = compute_remaining_bytes(data, len, tail_offset, h);
+
+    // Finalize hash.
+ h ^= len; + h = fmix32(h); + return h; + } + + private: + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; +}; + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()(bool const& key) const +{ + return compute(static_cast(key)); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()(float const& key) const +{ + return compute(normalize_nans_and_zeros(key)); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()(double const& key) const +{ + return compute(normalize_nans_and_zeros(key)); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + cudf::string_view const& key) const +{ + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return compute_bytes(data, len); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal32 const& key) const +{ + return compute(key.value()); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal64 const& key) const +{ + return compute(key.value()); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal128 const& key) const +{ + return compute(key.value()); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + cudf::list_view const& key) const +{ + CUDF_UNREACHABLE("List column hashing is not supported"); +} + +template <> +hash_value_type __device__ inline MurmurHash3_x86_32::operator()( + cudf::struct_view const& key) const +{ + CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp new file mode 100644 index 00000000000..5f79f05c5a1 --- /dev/null +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "datasource.hpp" + +#include +#include + +#include +#include + +namespace cudf::io { +/** + * @addtogroup io_datasources + * @{ + * @file + */ + +/** + * @brief Implementation class for reading from an Apache Arrow file. The file + * could be a memory-mapped file or other implementation supported by Arrow. + */ +class arrow_io_source : public datasource { + public: + /** + * @brief Constructs an object from an Apache Arrow Filesystem URI + * + * @param arrow_uri Apache Arrow Filesystem URI + */ + explicit arrow_io_source(std::string const& arrow_uri); + + /** + * @brief Constructs an object from an `arrow` source object. 
+ * + * @param file The `arrow` object from which the data is read + */ + explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} + + /** + * @brief Returns a buffer with a subset of data from the `arrow` source. + * + * @param offset The offset in bytes from which to read + * @param size The number of bytes to read + * @return A buffer with the read data + */ + std::unique_ptr host_read(size_t offset, size_t size) override; + + /** + * @brief Reads a selected range from the `arrow` source into a preallocated buffer. + * + * @param[in] offset The offset in bytes from which to read + * @param[in] size The number of bytes to read + * @param[out] dst The preallocated buffer to read into + * @return The number of bytes read + */ + size_t host_read(size_t offset, size_t size, uint8_t* dst) override; + /** + * @brief Returns the size of the data in the `arrow` source. + * + * @return The size of the data in the `arrow` source + */ + [[nodiscard]] size_t size() const override; + + private: + std::shared_ptr filesystem; + std::shared_ptr arrow_file; +}; + +/** @} */ // end of group +} // namespace cudf::io diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index b5669438b4f..c84ca7e6c73 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -208,7 +208,7 @@ class csv_reader_options { [[nodiscard]] std::size_t get_byte_range_padding() const { auto const num_names = _names.size(); - auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_dtypes = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); auto const num_columns = std::max(num_dtypes, num_names); auto const max_row_bytes = 16 * 1024; // 16KB @@ -567,31 +567,33 @@ class csv_reader_options { /** * @brief Sets number of rows to skip from start. * - * @param skip Number of rows to skip + * @param skiprows Number of rows to skip */ - void set_skiprows(size_type skip) + void set_skiprows(size_type skiprows) { - if ((skip != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { - CUDF_FAIL( - "skiprows can't be a non zero value if range offset and/or range size has been set"); + if ((skiprows != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { + CUDF_FAIL("skiprows must be zero if range offset or range size has been set", + std::invalid_argument); } - _skiprows = skip; + _skiprows = skiprows; } /** * @brief Sets number of rows to skip from end. * - * @param skip Number of rows to skip + * @param skipfooter Number of rows to skip */ - void set_skipfooter(size_type skip) + void set_skipfooter(size_type skipfooter) { - CUDF_EXPECTS((skip == 0) or (_nrows == -1), "Cannot use both `nrows` and `skipfooter`"); - if ((skip != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { - CUDF_FAIL( - "skipfooter can't be a non zero value if range offset and/or range size has been set"); + CUDF_EXPECTS((skipfooter == 0) or (_nrows == -1), + "Cannot use both `nrows` and `skipfooter`", + std::invalid_argument); + if ((skipfooter != 0) and ((_byte_range_offset != 0) or (_byte_range_size != 0))) { + CUDF_FAIL("skipfooter must be zero if range offset or range size has been set", + std::invalid_argument); } - _skipfooter = skip; + _skipfooter = skipfooter; } /** diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 88f9c188530..69d8a388d45 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -30,6 +30,13 @@ namespace cudf { //! 
IO interfaces namespace io { + +/** + * @addtogroup io_datasinks + * @{ + * @file + */ + /** * @brief Interface class for storing the output data from the writers */ @@ -41,7 +48,7 @@ class data_sink { * @param[in] filepath Path to the file to use * @return Constructed data_sink object */ - static std::unique_ptr create(const std::string& filepath); + static std::unique_ptr create(std::string const& filepath); /** * @brief Create a sink from a std::vector @@ -200,5 +207,6 @@ class data_sink { virtual size_t bytes_written() = 0; }; +/** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 5c37be5a56f..28263d466f3 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -22,35 +22,6 @@ #include -#include - -// We disable warning 611 because some Arrow subclasses of -// `arrow::fs::FileSystem` only partially override the `Equals` method, -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#endif -#include -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#endif - -// We disable warning 2810 to workaround the compile issue (warning treated as error): -// result.h(263): error #2810-D: ignoring return value type with "nodiscard" attribute -#ifdef __CUDACC__ -#pragma nv_diag_suppress 2810 -#endif -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 2810 -#endif - -#include -#include -#include -#include - #include #include @@ -58,6 +29,12 @@ namespace cudf { //! IO interfaces namespace io { +/** + * @addtogroup io_datasources + * @{ + * @file + */ + /** * @brief Interface class for providing input data to the readers. */ @@ -113,7 +90,7 @@ class datasource { * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) * @return Constructed datasource object */ - static std::unique_ptr create(const std::string& filepath, + static std::unique_ptr create(std::string const& filepath, size_t offset = 0, size_t size = 0); @@ -143,15 +120,6 @@ class datasource { */ static std::unique_ptr create(cudf::device_span buffer); - /** - * @brief Creates a source from a from an Arrow file. - * - * @param[in] arrow_file RandomAccessFile to which the API calls are forwarded - * @return Constructed datasource object - */ - static std::unique_ptr create( - std::shared_ptr arrow_file); - /** * @brief Creates a source from an user implemented datasource object. * @@ -406,107 +374,6 @@ class datasource { }; }; -/** - * @brief Implementation class for reading from an Apache Arrow file. The file - * could be a memory-mapped file or other implementation supported by Arrow. - */ -class arrow_io_source : public datasource { - /** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. 
- */ - class arrow_io_buffer : public buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(arrow_buffer) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } - }; - - public: - /** - * @brief Constructs an object from an Apache Arrow Filesystem URI - * - * @param arrow_uri Apache Arrow Filesystem URI - */ - explicit arrow_io_source(std::string_view arrow_uri) - { - const std::string uri_start_delimiter = "//"; - const std::string uri_end_delimiter = "?"; - - arrow::Result> result = - arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); - size_t end = arrow_uri.find(uri_end_delimiter) - start; - std::string_view path = arrow_uri.substr(start, end); - - arrow::Result> in_stream = - filesystem->OpenInputFile(static_cast(path).c_str()); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); - } - - /** - * @brief Constructs an object from an `arrow` source object. - * - * @param file The `arrow` object from which the data is read - */ - explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} - - /** - * @brief Returns a buffer with a subset of data from the `arrow` source. - * - * @param offset The offset in bytes from which to read - * @param size The number of bytes to read - * @return A buffer with the read data - */ - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); - } - - /** - * @brief Reads a selected range from the `arrow` source into a preallocated buffer. - * - * @param[in] offset The offset in bytes from which to read - * @param[in] size The number of bytes to read - * @param[out] dst The preallocated buffer to read into - * @return The number of bytes read - */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); - } - - /** - * @brief Returns the size of the data in the `arrow` source. 
- * - * @return The size of the data in the `arrow` source - */ - [[nodiscard]] size_t size() const override - { - auto result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); - } - - private: - std::shared_ptr filesystem; - std::shared_ptr arrow_file; -}; - +/** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index a62c9873e75..b7ee5e05e96 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -32,7 +32,7 @@ #include -namespace cudf::io::json::experimental::detail { +namespace cudf::io::json::detail { // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -181,7 +181,7 @@ process_string(in_iterator_t in_begin, cudf::io::parse_options_view const& options) { int32_t bytes = 0; - const auto num_in_chars = thrust::distance(in_begin, in_end); + auto const num_in_chars = thrust::distance(in_begin, in_end); // String values are indicated by keeping the quote character bool const is_string_value = num_in_chars >= 2LL && @@ -428,4 +428,4 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return out_col; } -} // namespace cudf::io::json::experimental::detail +} // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7b0350e9bc8..6930a4fdb25 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -33,7 +33,7 @@ namespace cudf::io::json::detail { * * @return cudf::table object that contains the array of cudf::column. */ -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index b7794c0df6a..623f402f9c9 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -70,11 +70,9 @@ class reader { * @brief Reads the entire dataset. * * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. - * * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options, rmm::cuda_stream_view stream); + table_with_metadata read(orc_reader_options const& options); }; /** diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 9a94924824d..3f2e1fa5e6c 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -21,6 +21,7 @@ #pragma once #include +#include #include #include @@ -208,8 +209,17 @@ class writer { * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ static std::unique_ptr> merge_row_group_metadata( - const std::vector>>& metadata_list); + std::vector>> const& metadata_list); }; +/** + * @brief Reads metadata of parquet dataset. + * + * @param sources Dataset sources to read from + * + * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value + * metadata. 
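
[Illustrative aside, not part of the patch.] With arrow_io_source moved out of datasource.hpp into its own cudf/io/arrow_io_source.hpp header, and the Arrow headers dropped from datasource.hpp, code that reads through Arrow filesystems now has to opt in explicitly. A rough sketch of the resulting call pattern; the helper name and file are made up, and it assumes source_info still accepts a user-provided datasource pointer as it does for other custom datasources:

    #include <cudf/io/arrow_io_source.hpp>  // now an explicit, separate include
    #include <cudf/io/csv.hpp>

    #include <memory>
    #include <string>

    // Hypothetical helper: read a CSV file through an Arrow filesystem URI.
    cudf::io::table_with_metadata read_csv_via_arrow(std::string const& uri)
    {
      // arrow_io_source no longer comes in through <cudf/io/datasource.hpp>.
      auto source = std::make_unique<cudf::io::arrow_io_source>(uri);

      auto options =
        cudf::io::csv_reader_options::builder(cudf::io::source_info{source.get()}).build();
      return cudf::io::read_csv(options);
    }
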
+ */ +parquet_metadata read_parquet_metadata(host_span const> sources); } // namespace detail::parquet } // namespace cudf::io diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index 4914f434c98..b2ea29a85c3 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -110,6 +110,8 @@ enum token_t : PdaTokenT { ValueEnd, /// Beginning-of-error token (on first encounter of a parsing error) ErrorBegin, + /// Delimiting a JSON line for error recovery + LineEnd, /// Total number of tokens NUM_TOKENS }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index f94fd5adeb8..15dc2a614ad 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -54,6 +54,14 @@ struct schema_element { std::map child_types; }; +/** + * @brief Control the error recovery behavior of the json parser + */ +enum class json_recovery_mode_t { + FAIL, ///< Does not recover from an error when encountering an invalid format + RECOVER_WITH_NULL ///< Recovers from an error, replacing invalid records with null +}; + /** * @brief Input arguments to the `read_json` interface. * @@ -105,12 +113,15 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Whether to recover after an invalid JSON line + json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + /** * @brief Constructor from source info. * * @param src source information used to read parquet file */ - explicit json_reader_options(const source_info& src) : _source(src) {} + explicit json_reader_options(source_info const& src) : _source(src) {} friend json_reader_options_builder; @@ -192,7 +203,7 @@ class json_reader_options { */ size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -235,6 +246,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Queries the JSON reader's behavior on invalid JSON lines. + * + * @returns An enum that specifies the JSON reader's behavior on invalid JSON lines. + */ + json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** * @brief Set data types for columns to be read. * @@ -305,6 +323,13 @@ class json_reader_options { * of string values */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + */ + void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } }; /** @@ -449,6 +474,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + * @return this for chaining + */ + json_reader_options_builder& recovery_mode(json_recovery_mode_t val) + { + options._recovery_mode = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. 
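
[Illustrative aside, not part of the patch.] The new json_recovery_mode_t knob is aimed at JSON-lines input, where a malformed line can be replaced by a null row instead of failing the whole read (the new LineEnd token above supports that recovery path). A sketch of opting in through the builder; the helper name and file path are hypothetical:

    #include <cudf/io/json.hpp>

    #include <string>

    // Hypothetical helper: parse JSON lines, substituting nulls for invalid lines.
    cudf::io::table_with_metadata read_json_lines_tolerant(std::string const& path)
    {
      auto options = cudf::io::json_reader_options::builder(cudf::io::source_info{path})
                       .lines(true)
                       .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
                       .build();
      return cudf::io::read_json(options);
    }
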
*/ diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index e3abbe6056f..024f4f23b94 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -445,9 +445,11 @@ class orc_writer_options { // Set of columns to output table_view _table; // Optional associated metadata - const table_input_metadata* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::map _user_data; + // Optional compression statistics + std::shared_ptr _compression_stats; friend orc_writer_options_builder; @@ -548,7 +550,7 @@ class orc_writer_options { * * @return Associated metadata */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -560,6 +562,16 @@ class orc_writer_options { return _user_data; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + // Setters /** @@ -637,7 +649,7 @@ class orc_writer_options { * * @param meta Associated metadata */ - void set_metadata(table_input_metadata const* meta) { _metadata = meta; } + void set_metadata(table_input_metadata meta) { _metadata = std::move(meta); } /** * @brief Sets metadata. @@ -648,6 +660,16 @@ class orc_writer_options { { _user_data = std::move(metadata); } + + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } }; /** @@ -757,9 +779,9 @@ class orc_writer_options_builder { * @param meta Associated metadata * @return this for chaining */ - orc_writer_options_builder& metadata(table_input_metadata const* meta) + orc_writer_options_builder& metadata(table_input_metadata meta) { - options._metadata = meta; + options._metadata = std::move(meta); return *this; } @@ -775,6 +797,19 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + orc_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -826,9 +861,11 @@ class chunked_orc_writer_options { // Row index stride (maximum number of rows in each row group) size_type _row_index_stride = default_row_index_stride; // Optional associated metadata - const table_input_metadata* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::map _user_data; + // Optional compression statistics + std::shared_ptr _compression_stats; friend chunked_orc_writer_options_builder; @@ -907,7 +944,7 @@ class chunked_orc_writer_options { * * @return Associated metadata */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. 
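
[Illustrative aside, not part of the patch.] Two caller-visible changes in the ORC writer options above: table_input_metadata is now stored by value (no raw pointer to keep alive for the writer's lifetime), and an optional writer_compression_statistics object can be attached and read back once the write finishes. A hedged sketch; the helper name, column name, and output path are hypothetical:

    #include <cudf/io/orc.hpp>
    #include <cudf/io/types.hpp>
    #include <cudf/table/table_view.hpp>

    #include <memory>
    #include <string>
    #include <utility>

    // Hypothetical helper: write ORC, then inspect the achieved compression ratio.
    double write_orc_with_stats(cudf::table_view const& table, std::string const& path)
    {
      auto stats = std::make_shared<cudf::io::writer_compression_statistics>();

      cudf::io::table_input_metadata metadata(table);  // passed by value below
      if (not metadata.column_metadata.empty()) { metadata.column_metadata[0].set_name("col0"); }

      auto options = cudf::io::orc_writer_options::builder(cudf::io::sink_info{path}, table)
                       .metadata(std::move(metadata))
                       .compression_statistics(stats)
                       .build();
      cudf::io::write_orc(options);

      return stats->compression_ratio();  // populated once the writer is done
    }
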
@@ -919,6 +956,16 @@ class chunked_orc_writer_options { return _user_data; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + // Setters /** @@ -989,7 +1036,7 @@ class chunked_orc_writer_options { * * @param meta Associated metadata */ - void metadata(table_input_metadata const* meta) { _metadata = meta; } + void metadata(table_input_metadata meta) { _metadata = std::move(meta); } /** * @brief Sets Key-Value footer metadata. @@ -1000,6 +1047,16 @@ class chunked_orc_writer_options { { _user_data = std::move(metadata); } + + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } }; /** @@ -1094,9 +1151,9 @@ class chunked_orc_writer_options_builder { * @param meta Associated metadata * @return this for chaining */ - chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta) + chunked_orc_writer_options_builder& metadata(table_input_metadata meta) { - options._metadata = meta; + options._metadata = std::move(meta); return *this; } @@ -1113,6 +1170,19 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + chunked_orc_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 9ad16a0e173..623ee2e49fc 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -355,13 +355,13 @@ class orc_metadata { }; /** - * @brief Reads file-level and stripe-level statistics of ORC dataset. + * @brief Reads metadata of ORC dataset. * * @ingroup io_readers * * @param src_info Dataset source * - * @return Column names and decoded ORC statistics + * @return orc_metadata with ORC schema, number of rows and number of stripes. */ orc_metadata read_orc_metadata(source_info const& src_info); diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 07d41e3b132..788ff15f3c1 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -62,6 +63,9 @@ class parquet_reader_options { // Number of rows to read; `nullopt` is all std::optional _num_rows; + // Predicate filter as AST to filter output rows. + std::optional> _filter; + // Whether to store string data as categorical type bool _convert_strings_to_categories = false; // Whether to use PANDAS metadata to load columns @@ -160,6 +164,13 @@ class parquet_reader_options { */ [[nodiscard]] auto const& get_row_groups() const { return _row_groups; } + /** + * @brief Returns AST based filter for predicate pushdown. + * + * @return AST expression to use as filter + */ + [[nodiscard]] auto const& get_filter() const { return _filter; } + /** * @brief Returns timestamp type used to cast timestamp columns. 
* @@ -181,6 +192,13 @@ class parquet_reader_options { */ void set_row_groups(std::vector> row_groups); + /** + * @brief Sets AST based filter for predicate pushdown. + * + * @param filter AST expression to use as filter + */ + void set_filter(ast::expression const& filter) { _filter = filter; } + /** * @brief Sets to enable/disable conversion of strings to categories. * @@ -273,6 +291,18 @@ class parquet_reader_options_builder { return *this; } + /** + * @brief Sets vector of individual row groups to read. + * + * @param filter Vector of row groups to read + * @return this for chaining + */ + parquet_reader_options_builder& filter(ast::expression const& filter) + { + options.set_filter(filter); + return *this; + } + /** * @brief Sets enable/disable conversion of strings to categories. * @@ -472,7 +502,7 @@ class parquet_writer_options { // Partitions described as {start_row, num_rows} pairs std::vector _partitions; // Optional associated metadata - table_input_metadata const* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. @@ -496,6 +526,10 @@ class parquet_writer_options { size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment std::optional _max_page_fragment_size; + // Optional compression statistics + std::shared_ptr _compression_stats; + // write V2 page headers? + bool _v2_page_headers = false; /** * @brief Constructor from sink and table. @@ -575,7 +609,7 @@ class parquet_writer_options { * * @return Associated metadata */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -670,6 +704,23 @@ class parquet_writer_options { */ [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + + /** + * @brief Returns `true` if V2 page headers should be written. + * + * @return `true` if V2 page headers should be written. + */ + [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** * @brief Sets partitions. * @@ -683,7 +734,7 @@ class parquet_writer_options { * * @param metadata Associated metadata */ - void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } /** * @brief Sets metadata. @@ -777,6 +828,23 @@ class parquet_writer_options { * @param size_rows Maximum page fragment size, in rows. */ void set_max_page_fragment_size(size_type size_rows); + + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } + + /** + * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. + * + * @param val Boolean value to enable/disable writing of V2 page headers. 
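
[Illustrative aside, not part of the patch.] The new filter() option takes a libcudf AST expression, the same expression machinery used by compute_column and the conditional joins, letting the reader drop rows that fail the predicate. A sketch of pushing a simple "col0 < 100" predicate into the reader; the column index, literal value, and helper name are made up, and the operands are kept alive for the duration of the read because the options appear to hold the expression by reference:

    #include <cudf/ast/expressions.hpp>
    #include <cudf/io/parquet.hpp>
    #include <cudf/scalar/scalar.hpp>

    #include <string>

    // Hypothetical helper: read only rows whose first column is < 100.
    cudf::io::table_with_metadata read_filtered(std::string const& path)
    {
      cudf::ast::column_reference col0{0};
      cudf::numeric_scalar<int32_t> hundred{100};
      cudf::ast::literal lit{hundred};
      cudf::ast::operation filter{cudf::ast::ast_operator::LESS, col0, lit};

      auto options = cudf::io::parquet_reader_options::builder(cudf::io::source_info{path})
                       .filter(filter)
                       .build();
      return cudf::io::read_parquet(options);
    }
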
+ */ + void enable_write_v2_headers(bool val) { _v2_page_headers = val; } }; /** @@ -819,9 +887,9 @@ class parquet_writer_options_builder { * @param metadata Associated metadata * @return this for chaining */ - parquet_writer_options_builder& metadata(table_input_metadata const* metadata) + parquet_writer_options_builder& metadata(table_input_metadata metadata) { - options._metadata = metadata; + options._metadata = std::move(metadata); return *this; } @@ -983,6 +1051,19 @@ class parquet_writer_options_builder { */ parquet_writer_options_builder& max_page_fragment_size(size_type val); + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + parquet_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -995,6 +1076,14 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if V2 page headers are to be written. + * + * @param enabled Boolean value to enable/disable writing of V2 page headers. + * @return this for chaining + */ + parquet_writer_options_builder& write_v2_headers(bool enabled); + /** * @brief move parquet_writer_options member once it's built. */ @@ -1037,7 +1126,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const * @return A parquet-compatible blob that contains the data for all row groups in the list */ std::unique_ptr> merge_row_group_metadata( - const std::vector>>& metadata_list); + std::vector>> const& metadata_list); class chunked_parquet_writer_options_builder; @@ -1052,7 +1141,7 @@ class chunked_parquet_writer_options { // Specify the level of statistics in the output file statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Optional associated metadata. - table_input_metadata const* _metadata = nullptr; + std::optional _metadata; // Optional footer key_value_metadata std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. @@ -1074,6 +1163,10 @@ class chunked_parquet_writer_options { size_t _max_dictionary_size = default_max_dictionary_size; // Maximum number of rows in a page fragment std::optional _max_page_fragment_size; + // Optional compression statistics + std::shared_ptr _compression_stats; + // write V2 page headers? + bool _v2_page_headers = false; /** * @brief Constructor from sink. @@ -1118,7 +1211,7 @@ class chunked_parquet_writer_options { * * @return Metadata information */ - [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] auto const& get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -1204,12 +1297,29 @@ class chunked_parquet_writer_options { */ [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; } + /** + * @brief Returns a shared pointer to the user-provided compression statistics. + * + * @return Compression statistics + */ + [[nodiscard]] std::shared_ptr get_compression_statistics() const + { + return _compression_stats; + } + + /** + * @brief Returns `true` if V2 page headers should be written. + * + * @return `true` if V2 page headers should be written. 
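
[Illustrative aside, not part of the patch.] The Parquet writer gains the same compression-statistics hook as ORC, plus opt-in V2 data page headers; both the single-shot and chunked option classes carry the new settings. A sketch against the single-shot writer, with hypothetical names:

    #include <cudf/io/parquet.hpp>
    #include <cudf/table/table_view.hpp>

    #include <memory>
    #include <string>

    // Hypothetical helper: write Parquet with V2 page headers and collect stats.
    void write_parquet_v2(cudf::table_view const& table, std::string const& path)
    {
      auto stats = std::make_shared<cudf::io::writer_compression_statistics>();

      auto options = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{path}, table)
                       .write_v2_headers(true)
                       .compression_statistics(stats)
                       .build();
      cudf::io::write_parquet(options);

      // Input bytes seen by the compressor: compressed + failed + skipped.
      auto total_in = stats->num_total_input_bytes();
      (void)total_in;
    }
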
+ */ + [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; } + /** * @brief Sets metadata. * * @param metadata Associated metadata */ - void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); } /** * @brief Sets Key-Value footer metadata. @@ -1297,6 +1407,23 @@ class chunked_parquet_writer_options { */ void set_max_page_fragment_size(size_type size_rows); + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be updated after writing + */ + void set_compression_statistics(std::shared_ptr comp_stats) + { + _compression_stats = std::move(comp_stats); + } + + /** + * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`. + * + * @param val Boolean value to enable/disable writing of V2 page headers. + */ + void enable_write_v2_headers(bool val) { _v2_page_headers = val; } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -1334,9 +1461,9 @@ class chunked_parquet_writer_options_builder { * @param metadata Associated metadata * @return this for chaining */ - chunked_parquet_writer_options_builder& metadata(table_input_metadata const* metadata) + chunked_parquet_writer_options_builder& metadata(table_input_metadata metadata) { - options._metadata = metadata; + options._metadata = std::move(metadata); return *this; } @@ -1350,7 +1477,7 @@ class chunked_parquet_writer_options_builder { std::vector> metadata); /** - * @brief Sets Sets the level of statistics in chunked_parquet_writer_options. + * @brief Sets the level of statistics in chunked_parquet_writer_options. * * @param sf Level of statistics requested in the output file * @return this for chaining @@ -1388,6 +1515,14 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if V2 page headers are to be written. + * + * @param enabled Boolean value to enable/disable writing of V2 page headers. + * @return this for chaining + */ + chunked_parquet_writer_options_builder& write_v2_headers(bool enabled); + /** * @brief Sets the maximum row group size, in bytes. * @@ -1503,6 +1638,19 @@ class chunked_parquet_writer_options_builder { */ chunked_parquet_writer_options_builder& max_page_fragment_size(size_type val); + /** + * @brief Sets the pointer to the output compression statistics. + * + * @param comp_stats Pointer to compression statistics to be filled once writer is done + * @return this for chaining + */ + chunked_parquet_writer_options_builder& compression_statistics( + std::shared_ptr const& comp_stats) + { + options._compression_stats = comp_stats; + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/parquet_metadata.hpp b/cpp/include/cudf/io/parquet_metadata.hpp new file mode 100644 index 00000000000..0c985fc3c69 --- /dev/null +++ b/cpp/include/cudf/io/parquet_metadata.hpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file parquet_metadata.hpp + * @brief cuDF-IO freeform API + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cudf { +namespace io { + +namespace parquet { +/** + * @brief Basic data types in Parquet, determines how data is physically stored + */ +enum class TypeKind : int8_t { + UNDEFINED_TYPE = -1, // Undefined for non-leaf nodes + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, // Deprecated + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, +}; +} // namespace parquet + +/** + * @brief Schema of a parquet column, including the nested columns. + */ +struct parquet_column_schema { + public: + /** + * @brief constructor + * + * @param name column name + * @param type parquet type + * @param children child columns (empty for non-nested types) + */ + parquet_column_schema(std::string_view name, + parquet::TypeKind type, + std::vector children) + : _name{name}, _type_kind{type}, _children{std::move(children)} + { + } + + /** + * @brief Returns parquet column name; can be empty. + * + * @return Column name + */ + [[nodiscard]] auto name() const { return _name; } + + /** + * @brief Returns parquet type of the column. + * + * @return Column parquet type + */ + [[nodiscard]] auto type_kind() const { return _type_kind; } + + /** + * @brief Returns schemas of all child columns. + * + * @return Children schemas + */ + [[nodiscard]] auto const& children() const& { return _children; } + + /** @copydoc children + * Children array is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto children() && { return std::move(_children); } + + /** + * @brief Returns schema of the child with the given index. + * + * @param idx child index + * + * @return Child schema + */ + [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); } + + /** @copydoc child + * Child is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto child(int idx) && { return std::move(children().at(idx)); } + + /** + * @brief Returns the number of child columns. + * + * @return Children count + */ + [[nodiscard]] auto num_children() const { return children().size(); } + + private: + std::string _name; + // 3 types available: Physical, Converted, Logical. + parquet::TypeKind _type_kind; // Physical + std::vector _children; +}; + +/** + * @brief Schema of a parquet file. + */ +struct parquet_schema { + public: + /** + * @brief constructor + * + * @param root_column_schema root column + */ + parquet_schema(parquet_column_schema root_column_schema) : _root{std::move(root_column_schema)} {} + + /** + * @brief Returns the schema of the struct column that contains all columns as fields. + * + * @return Root column schema + */ + [[nodiscard]] auto const& root() const& { return _root; } + + /** @copydoc root + * Root column schema is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto root() && { return std::move(_root); } + + private: + parquet_column_schema _root; +}; + +/** + * @brief Information about content of a parquet file. 
+ */ +class parquet_metadata { + public: + /// Key-value metadata in the file footer. + using key_value_metadata = std::unordered_map; + + /** + * @brief constructor + * + * @param schema parquet schema + * @param num_rows number of rows + * @param num_rowgroups number of row groups + * @param file_metadata key-value metadata in the file footer + */ + parquet_metadata(parquet_schema schema, + int64_t num_rows, + size_type num_rowgroups, + key_value_metadata file_metadata) + : _schema{std::move(schema)}, + _num_rows{num_rows}, + _num_rowgroups{num_rowgroups}, + _file_metadata{std::move(file_metadata)} + { + } + + /** + * @brief Returns the parquet schema. + * + * @return parquet schema + */ + [[nodiscard]] auto const& schema() const { return _schema; } + + /** + * @brief Returns the number of rows of the root column. + * + * If a file contains list columns, nested columns can have a different number of rows. + * + * @return Number of rows + */ + [[nodiscard]] auto num_rows() const { return _num_rows; } + + /** + * @brief Returns the number of rowgroups in the file. + * + * @return Number of row groups + */ + [[nodiscard]] auto num_rowgroups() const { return _num_rowgroups; } + /** + * @brief Returns the Key value metadata in the file footer. + * + * @return Key value metadata as a map + */ + [[nodiscard]] auto const& metadata() const { return _file_metadata; } + + private: + parquet_schema _schema; + int64_t _num_rows; + size_type _num_rowgroups; + key_value_metadata _file_metadata; +}; + +/** + * @brief Reads metadata of parquet dataset. + * + * @ingroup io_readers + * + * @param src_info Dataset source + * + * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value + * metadata. + */ +parquet_metadata read_parquet_metadata(source_info const& src_info); + +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index f5230863f17..046994d33cc 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,7 +41,7 @@ std::unique_ptr make_source(datasource& data); * @return the data chunk source for the provided host data. It copies data from the host to the * device. */ -std::unique_ptr make_source(host_span data); +std::unique_ptr make_source(host_span data); /** * @brief Creates a data source capable of producing device-buffered views of the file diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 7426811a18d..a97f81182ac 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -32,13 +32,6 @@ #include #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { //! IO interfaces namespace io { @@ -100,6 +93,104 @@ enum statistics_freq { STATISTICS_COLUMN = 3, ///< Full column and offset indices. Implies STATISTICS_ROWGROUP }; +/** + * @brief Statistics about compression performed by a writer. + */ +class writer_compression_statistics { + public: + /** + * @brief Default constructor + */ + writer_compression_statistics() = default; + + /** + * @brief Constructor with initial values. 
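
[Illustrative aside, not part of the patch.] The new public read_parquet_metadata() is the Parquet counterpart of read_orc_metadata(): it only decodes the footer, so schema, row counts, and key-value metadata come back without touching the column data. A sketch of walking the returned schema; the helper name and file path are placeholders:

    #include <cudf/io/parquet_metadata.hpp>
    #include <cudf/io/types.hpp>

    #include <iostream>
    #include <string>

    // Hypothetical helper: print basic facts about a Parquet file from its footer.
    void describe_parquet(std::string const& path)
    {
      auto metadata = cudf::io::read_parquet_metadata(cudf::io::source_info{path});

      std::cout << "rows: " << metadata.num_rows()
                << ", row groups: " << metadata.num_rowgroups() << '\n';

      // The root is a struct-like column whose children are the top-level columns.
      for (auto const& column : metadata.schema().root().children()) {
        std::cout << column.name() << " (physical type "
                  << static_cast<int>(column.type_kind()) << ")\n";
      }
    }
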
+ * + * @param num_compressed_bytes The number of bytes that were successfully compressed + * @param num_failed_bytes The number of bytes that failed to compress + * @param num_skipped_bytes The number of bytes that were skipped during compression + * @param num_compressed_output_bytes The number of bytes in the compressed output + */ + writer_compression_statistics(size_t num_compressed_bytes, + size_t num_failed_bytes, + size_t num_skipped_bytes, + size_t num_compressed_output_bytes) + : _num_compressed_bytes(num_compressed_bytes), + _num_failed_bytes(num_failed_bytes), + _num_skipped_bytes(num_skipped_bytes), + _num_compressed_output_bytes(num_compressed_output_bytes) + { + } + + /** + * @brief Adds the values from another `writer_compression_statistics` object. + * + * @param other The other writer_compression_statistics object + * @return writer_compression_statistics& Reference to this object + */ + writer_compression_statistics& operator+=(writer_compression_statistics const& other) noexcept + { + _num_compressed_bytes += other._num_compressed_bytes; + _num_failed_bytes += other._num_failed_bytes; + _num_skipped_bytes += other._num_skipped_bytes; + _num_compressed_output_bytes += other._num_compressed_output_bytes; + return *this; + } + + /** + * @brief Returns the number of bytes in blocks that were successfully compressed. + * + * This is the number of bytes that were actually compressed, not the size of the compressed + * output. + * + * @return size_t The number of bytes that were successfully compressed + */ + [[nodiscard]] auto num_compressed_bytes() const noexcept { return _num_compressed_bytes; } + + /** + * @brief Returns the number of bytes in blocks that failed to compress. + * + * @return size_t The number of bytes that failed to compress + */ + [[nodiscard]] auto num_failed_bytes() const noexcept { return _num_failed_bytes; } + + /** + * @brief Returns the number of bytes in blocks that were skipped during compression. + * + * @return size_t The number of bytes that were skipped during compression + */ + [[nodiscard]] auto num_skipped_bytes() const noexcept { return _num_skipped_bytes; } + + /** + * @brief Returns the total size of compression inputs. + * + * @return size_t The total size of compression inputs + */ + [[nodiscard]] auto num_total_input_bytes() const noexcept + { + return num_compressed_bytes() + num_failed_bytes() + num_skipped_bytes(); + } + + /** + * @brief Returns the compression ratio for the successfully compressed blocks. + * + * Returns nan if there were no successfully compressed blocks. + * + * @return double The ratio between the size of the compression inputs and the size of the + * compressed output. + */ + [[nodiscard]] auto compression_ratio() const noexcept + { + return static_cast(num_compressed_bytes()) / _num_compressed_output_bytes; + } + + private: + std::size_t _num_compressed_bytes = 0; ///< The number of bytes that were successfully compressed + std::size_t _num_failed_bytes = 0; ///< The number of bytes that failed to compress + std::size_t _num_skipped_bytes = 0; ///< The number of bytes that were skipped during compression + std::size_t _num_compressed_output_bytes = 0; ///< The number of bytes in the compressed output +}; + /** * @brief Control use of dictionary encoding for parquet writer */ @@ -110,20 +201,27 @@ enum dictionary_policy { }; /** - * @brief Detailed name information for output columns. + * @brief Detailed name (and optionally nullability) information for output columns. 
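
[Illustrative aside, not part of the patch.] writer_compression_statistics is plain host-side bookkeeping, so its accumulation and ratio semantics can be exercised without a GPU. A small sketch, assuming the class is usable host-only as written; the byte counts are made up:

    #include <cudf/io/types.hpp>

    #include <cassert>

    int main()
    {
      // Two batches of compression results, accumulated with operator+=:
      // (compressed, failed, skipped, compressed-output) byte counts.
      cudf::io::writer_compression_statistics stats{800, 100, 100, 400};
      stats += cudf::io::writer_compression_statistics{200, 0, 0, 100};

      assert(stats.num_total_input_bytes() == 1200);  // 1000 + 100 + 100
      assert(stats.compression_ratio() == 2.0);       // 1000 input bytes -> 500 output bytes
      return 0;
    }
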
* * The hierarchy of children matches the hierarchy of children in the output * cudf columns. */ struct column_name_info { std::string name; ///< Column name + std::optional is_nullable; ///< Column nullability std::vector children; ///< Child column names + /** - * @brief Construct a column name info with a name and no children + * @brief Construct a column name info with a name, optional nullabilty, and no children * * @param _name Column name + * @param _is_nullable True if column is nullable */ - column_name_info(std::string const& _name) : name(_name) {} + column_name_info(std::string const& _name, std::optional _is_nullable = std::nullopt) + : name(_name), is_nullable(_is_nullable) + { + } + column_name_info() = default; }; @@ -165,7 +263,7 @@ struct host_buffer { * @param data Pointer to the buffer * @param size Size of the buffer */ - host_buffer(const char* data, size_t size) : data(data), size(size) {} + host_buffer(char const* data, size_t size) : data(data), size(size) {} }; /** @@ -188,8 +286,6 @@ constexpr inline auto is_byte_like_type() * @brief Source information for read interfaces */ struct source_info { - std::vector> _files; //!< Input files - source_info() = default; /** @@ -233,7 +329,7 @@ struct source_info { * @param host_data Input buffer in host memory * @param size Size of the buffer */ - explicit source_info(const char* host_data, size_t size) + explicit source_info(char const* host_data, size_t size) : _type(io_type::HOST_BUFFER), _host_buffers( {cudf::host_span(reinterpret_cast(host_data), size)}) @@ -340,12 +436,6 @@ struct source_info { * @return The device buffers of the input */ [[nodiscard]] auto const& device_buffers() const { return _device_buffers; } - /** - * @brief Get the input files - * - * @return The input files - */ - [[nodiscard]] auto const& files() const { return _files; } /** * @brief Get the user sources of the input * @@ -715,7 +805,17 @@ class table_input_metadata { * * @param table The table_view to construct metadata for */ - table_input_metadata(table_view const& table); + explicit table_input_metadata(table_view const& table); + + /** + * @brief Construct a new table_input_metadata from a table_metadata object. + * + * The constructed table_input_metadata has the same structure, column names and nullability as + * the passed table_metadata. + * + * @param metadata The table_metadata to construct table_intput_metadata for + */ + explicit table_input_metadata(table_metadata const& metadata); std::vector column_metadata; //!< List of column metadata }; diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 11d1bbf9fc8..6c50e1d5998 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -34,10 +34,11 @@ namespace cudf { // forward declaration -namespace detail { +namespace hashing::detail { template -class MurmurHash3_32; - +class MurmurHash3_x86_32; +} // namespace hashing::detail +namespace detail { template class hash_join; } // namespace detail @@ -167,7 +168,7 @@ full_join(cudf::table_view const& left_keys, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a vector of row indices corresponding to a left semi join + * @brief Returns a vector of row indices corresponding to a left semi-join * between the specified tables. 
* * The returned vector contains the row indices from the left table @@ -179,13 +180,9 @@ full_join(cudf::table_view const& left_keys, * Result: {1, 2} * @endcode * - * @throw cudf::logic_error if number of columns in either - * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE - * - * @param[in] left_keys The left table - * @param[in] right_keys The right table - * @param[in] compare_nulls controls whether null join-key values - * should match or not. + * @param left_keys The left table + * @param right_keys The right table + * @param compare_nulls Controls whether null join-key values should match or not * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct @@ -276,7 +273,7 @@ enum class nullable_join : bool { YES, NO }; class hash_join { public: using impl_type = typename cudf::detail::hash_join< - cudf::detail::MurmurHash3_32>; ///< Implementation type + cudf::hashing::detail::MurmurHash3_x86_32>; ///< Implementation type hash_join() = delete; ~hash_join(); @@ -302,7 +299,7 @@ class hash_join { /** * @copydoc hash_join(cudf::table_view const&, null_equality, rmm::cuda_stream_view) * - * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or + * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or * any `probe` table that will be used later for join */ hash_join(cudf::table_view const& build, @@ -326,7 +323,7 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing an inner join between two tables with `build` and `probe` - * as the the join keys . + * as the join keys . */ std::pair>, std::unique_ptr>> @@ -351,7 +348,7 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a left join between two tables with `build` and `probe` - * as the the join keys . + * as the join keys . */ std::pair>, std::unique_ptr>> @@ -376,7 +373,7 @@ class hash_join { * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a full join between two tables with `build` and `probe` - * as the the join keys . + * as the join keys . */ std::pair>, std::unique_ptr>> @@ -396,7 +393,7 @@ class hash_join { * constructed with null check. * * @return The exact number of output when performing an inner join between two tables with - * `build` and `probe` as the the join keys . + * `build` and `probe` as the join keys . */ [[nodiscard]] std::size_t inner_join_size( cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; @@ -412,7 +409,7 @@ class hash_join { * constructed with null check. * * @return The exact number of output when performing a left join between two tables with `build` - * and `probe` as the the join keys . + * and `probe` as the join keys . */ [[nodiscard]] std::size_t left_join_size( cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; @@ -430,7 +427,7 @@ class hash_join { * constructed with null check. * * @return The exact number of output when performing a full join between two tables with `build` - * and `probe` as the the join keys . + * and `probe` as the join keys . 
*/ std::size_t full_join_size( cudf::table_view const& probe, @@ -438,7 +435,7 @@ class hash_join { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: - const std::unique_ptr _impl; + const std::unique_ptr _impl; }; /** diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 531396e940e..0bc76828fc3 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -81,9 +81,7 @@ std::unique_ptr concatenate_rows( * @endcode * * @throws std::invalid_argument if the input column is not at least two-level depth lists column - * (i.e., each row must be a list of list). - * @throws cudf::logic_error if the input lists column contains nested typed entries that are not - * lists. + * (i.e., each row must be a list of lists). * * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 83710a49f6a..18fe707fd69 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include namespace cudf { namespace lists { @@ -74,25 +74,15 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, { // size of the gather map is the # of output rows size_type output_count = gather_map_size; - size_type offset_count = output_count + 1; // offsets of the source column int32_t const* src_offsets{source_column.offsets().data() + source_column.offset()}; size_type const src_size = source_column.size(); - // outgoing offsets. these will persist as output from the entire gather operation - auto dst_offsets_c = cudf::make_fixed_width_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view(); auto const source_column_nullmask = source_column.null_mask(); - // generate the compacted outgoing offsets. 
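
[Illustrative aside, not part of the patch.] Looking back at the join changes a few hunks above: hash_join now names cudf::hashing::detail::MurmurHash3_x86_32 as its default hasher, but from the caller's side the point of the class is unchanged: build the hash table once and probe it repeatedly. A sketch with hypothetical table names:

    #include <cudf/join.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf/types.hpp>

    // Hypothetical helper: reuse one build-side hash table for two probes.
    void probe_twice(cudf::table_view const& build_keys,
                     cudf::table_view const& probe_a,
                     cudf::table_view const& probe_b)
    {
      cudf::hash_join joiner(build_keys, cudf::null_equality::EQUAL);

      auto [left_indices_a, right_indices_a] = joiner.inner_join(probe_a);
      auto [left_indices_b, right_indices_b] = joiner.inner_join(probe_b);

      // Each result is a pair of device vectors of matching row indices that can
      // be fed to cudf::gather to materialize the joined tables.
    }
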
- auto count_iter = thrust::make_counting_iterator(0); - thrust::transform_exclusive_scan( - rmm::exec_policy_nosync(stream), - count_iter, - count_iter + offset_count, - dst_offsets_v.begin(), + auto sizes_itr = cudf::detail::make_counting_transform_iterator( + 0, [source_column_nullmask, source_column_offset = source_column.offset(), gather_map, @@ -112,9 +102,10 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; - }, - 0, - thrust::plus()); + }); + + auto [dst_offsets_c, map_size] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + output_count, stream, mr); // handle sliced columns size_type const shift = @@ -147,9 +138,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, }); // Retrieve size of the resulting gather map for level N+1 (the last offset) - size_type child_gather_map_size = - cudf::detail::get_value(dst_offsets_c->view(), output_count, stream); - + auto const child_gather_map_size = static_cast(map_size); return {std::move(dst_offsets_c), std::move(base_offsets), child_gather_map_size}; } diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 18cb147d1e4..f04b2fda2bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -240,11 +240,11 @@ std::unique_ptr scatter(scalar const& slr, rmm::device_buffer null_mask = slr_valid ? cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); - auto offset_column = make_numeric_column( - data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); + auto offset_column = + make_numeric_column(data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy_nosync(stream), - offset_column->mutable_view().begin(), - offset_column->mutable_view().end(), + offset_column->mutable_view().begin(), + offset_column->mutable_view().end(), 0, lv->view().size()); auto wrapped = column_view(data_type{type_id::LIST}, diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 336214e3934..8c6368eacb6 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -45,7 +45,7 @@ class lists_column_view : private column_view { */ lists_column_view(column_view const& lists_column); lists_column_view(lists_column_view&&) = default; ///< Move constructor - lists_column_view(const lists_column_view&) = default; ///< Copy constructor + lists_column_view(lists_column_view const&) = default; ///< Copy constructor ~lists_column_view() = default; /** * @brief Copy assignment operator @@ -71,9 +71,7 @@ class lists_column_view : private column_view { using column_view::null_mask; using column_view::offset; using column_view::size; - static_assert(std::is_same_v, - "offset_type is expected to be the same as size_type."); - using offset_iterator = offset_type const*; ///< Iterator type for offsets + using offset_iterator = size_type const*; ///< Iterator type for offsets /** * @brief Returns the parent column. 
@@ -119,7 +117,7 @@ class lists_column_view : private column_view { */ [[nodiscard]] offset_iterator offsets_begin() const noexcept { - return offsets().begin() + offset(); + return offsets().begin() + offset(); } /** diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 360006c1eea..672f479ad53 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -36,6 +36,8 @@ namespace cudf { * @brief Returns the null count for a null mask of the specified `state` * representing `size` elements. * + * @throw std::invalid_argument if state is UNINITIALIZED + * * @param state The state of the null mask * @param size The number of elements represented by the mask * @return The count of null elements @@ -168,5 +170,21 @@ std::pair bitmask_or( table_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Given a validity bitmask, counts the number of null elements (unset bits) + * in the range `[start, stop)`. + * + * If `bitmask == nullptr`, all elements are assumed to be valid and the + * function returns ``. + * + * @throws cudf::logic_error if `start > stop` + * @throws cudf::logic_error if `start < 0` + * + * @param bitmask Validity bitmask residing in device memory. + * @param start Index of the first bit to count (inclusive). + * @param stop Index of the last bit to count (exclusive). + * @return The number of null elements in the specified range. + */ +cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index a5675b5f031..52aebeb55e5 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -198,7 +198,7 @@ std::unique_ptr segmented_reduce( * @returns Scanned output column */ std::unique_ptr scan( - const column_view& input, + column_view const& input, scan_aggregation const& agg, scan_type inclusive, null_policy null_handling = null_policy::EXCLUDE, diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp new file mode 100644 index 00000000000..4cbfb82ae6b --- /dev/null +++ b/cpp/include/cudf/reduction/detail/reduction.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cudf::reduction::detail { + +/** + * @copydoc cudf::reduce(column_view const&, reduce_aggregation const&, data_type, + * std::optional>, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr reduce(column_view const& col, + reduce_aggregation const& agg, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::reduction::detail diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 0dba84a0b28..a747f7bade7 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -183,7 +183,7 @@ struct compound_op : public simple_op { * @copydoc simple_op::template get_null_replacing_element_transformer() */ template - auto get_null_replacing_element_transformer() override + auto get_null_replacing_element_transformer() { using element_transformer = typename Derived::transformer; using OutputType = typename Derived::intermediate::IntermediateType; @@ -202,9 +202,9 @@ struct compound_op : public simple_op { * @return transformed output result of compound operator */ template - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { // Enforced interface return Derived::template intermediate::compute_result(input, count, ddof); @@ -231,9 +231,9 @@ struct mean : public compound_op { using IntermediateType = ResultType; // sum value // compute `mean` from intermediate type `IntermediateType` - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { return (input / count); }; @@ -252,9 +252,9 @@ struct variance : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `variance` from intermediate type `IntermediateType` - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { ResultType mean = input.value / count; ResultType asum = input.value_squared; @@ -278,9 +278,9 @@ struct standard_deviation : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `standard deviation` from intermediate type `IntermediateType` - CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(IntermediateType const& input, + cudf::size_type const& count, + cudf::size_type const& ddof) { using intermediateOp = variance::template intermediate; ResultType var = intermediateOp::compute_result(input, count, ddof); diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp index 9df58306ace..3405dc8b796 100644 --- a/cpp/include/cudf/replace.hpp +++ b/cpp/include/cudf/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -45,6 +46,7 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; * * @param[in] input A column whose null values will be replaced * @param[in] replacement A cudf::column whose values will replace null values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns A copy of `input` with the null values replaced with corresponding values from @@ -53,6 +55,7 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING }; std::unique_ptr replace_nulls( column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,6 +66,7 @@ std::unique_ptr replace_nulls( * * @param[in] input A column whose null values will be replaced * @param[in] replacement Scalar used to replace null values in `input` + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns Copy of `input` with null values replaced by `replacement` @@ -70,6 +74,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,6 +85,7 @@ std::unique_ptr replace_nulls( * * @param[in] input A column whose null values will be replaced * @param[in] replace_policy Specify the position of replacement values relative to null values + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @returns Copy of `input` with null values replaced based on `replace_policy` @@ -87,6 +93,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, replace_policy const& replace_policy, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -106,6 +113,7 @@ std::unique_ptr replace_nulls( * * @param input A column whose NaN values will be replaced * @param replacement A cudf::column whose values will replace NaN values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A copy of `input` with the NaN values replaced with corresponding values from * `replacement`. 
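As a quick illustration of the stream parameters being added in this header, the sketch below forwards a caller-provided stream to the new `replace_nulls` overload; the wrapper function and column names are hypothetical, and the memory-resource argument is left at its default.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/replace.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Hypothetical helper: fill nulls in `input` from the corresponding rows of
// `replacement`, issuing all device work on the caller's stream.
std::unique_ptr<cudf::column> fill_nulls_on_stream(cudf::column_view const& input,
                                                   cudf::column_view const& replacement,
                                                   rmm::cuda_stream_view stream)
{
  // `stream` defaults to cudf::get_default_stream(), so existing call sites that
  // pass only (input, replacement) continue to compile unchanged.
  return cudf::replace_nulls(input, replacement, stream);
}
```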
@@ -113,6 +121,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +141,14 @@ std::unique_ptr replace_nans( * * @param input A column whose NaN values will be replaced * @param replacement A cudf::scalar whose value will replace NaN values in input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A copy of `input` with the NaN values replaced by `replacement` */ std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -147,6 +158,7 @@ std::unique_ptr replace_nans( * @param input_col The column to find and replace values in * @param values_to_replace The values to replace * @param replacement_values The values to replace with + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Copy of `input_col` with specified values replaced @@ -155,6 +167,7 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -198,6 +211,7 @@ std::unique_ptr find_and_replace_all( * @param[in] hi Maximum clamp value. All elements greater than `hi` will be replaced by * `hi_replace`. Ignored if null. * @param[in] hi_replace All elements greater than `hi` will be replaced by `hi_replace` + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Returns a clamped column as per `lo` and `hi` boundaries @@ -208,6 +222,7 @@ std::unique_ptr clamp( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -244,6 +259,7 @@ std::unique_ptr clamp( * if null. * @param[in] hi Maximum clamp value. All elements greater than `hi` will be replaced by `hi` * Ignored if null. + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr Device memory resource used to allocate device memory of the returned column * * @return Returns a clamped column as per `lo` and `hi` boundaries @@ -252,6 +268,7 @@ std::unique_ptr clamp( column_view const& input, scalar const& lo, scalar const& hi, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -264,12 +281,14 @@ std::unique_ptr clamp( * * @throws cudf::logic_error if column does not have floating point data type. 
* @param[in] input column_view of floating-point elements to copy and normalize + * @param stream CUDA stream used for device memory operations and kernel launches * @param[in] mr device_memory_resource allocator for allocating output data * * @returns new column with the modified data */ std::unique_ptr normalize_nans_and_zeros( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -282,8 +301,10 @@ std::unique_ptr normalize_nans_and_zeros( * * @throws cudf::logic_error if column does not have floating point data type. * @param[in, out] in_out of floating-point elements to normalize + * @param stream CUDA stream used for device memory operations and kernel launches */ -void normalize_nans_and_zeros(mutable_column_view& in_out); +void normalize_nans_and_zeros(mutable_column_view& in_out, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index aa78979bf7a..efdb85691bd 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -114,19 +114,29 @@ struct window_bounds { return window_bounds(true, std::numeric_limits::max()); } - // TODO: In the future, add units for bounds. - // E.g. {value=1, unit=DAYS, unbounded=false} - // For the present, assume units from context: - // 1. For time-based window functions, assume DAYS as before - // 2. For all else, assume ROWS as before. - const bool is_unbounded; ///< Whether the window boundary is unbounded - const size_type value; ///< Finite window boundary value (in days or rows) + /** + * Whether the window_bounds is unbounded. + * + * @return true if the window bounds is unbounded. + * @return false if the window bounds has a finite row boundary. + */ + [[nodiscard]] bool is_unbounded() const { return _is_unbounded; } + + /** + * @brief Gets the row-boundary for this window_bounds. + * + * @return the row boundary value (in days or rows) + */ + [[nodiscard]] size_type value() const { return _value; } private: explicit window_bounds(bool is_unbounded_, size_type value_ = 0) - : is_unbounded{is_unbounded_}, value{value_} + : _is_unbounded{is_unbounded_}, _value{value_} { } + + bool const _is_unbounded; ///< Whether the window boundary is unbounded + size_type const _value; ///< Finite window boundary value (in days or rows) }; /** diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp index b08792740ff..c5b0c219373 100644 --- a/cpp/include/cudf/rolling/range_window_bounds.hpp +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -59,7 +59,7 @@ struct range_window_bounds { * @brief Factory method to construct a window boundary * limited to the value of the current row * - * @param type type The datatype of the window boundary + * @param type The datatype of the window boundary * @return A "current row" window boundary object */ static range_window_bounds current_row(data_type type); @@ -75,7 +75,7 @@ struct range_window_bounds { /** * @brief Factory method to construct an unbounded window boundary. 
* - * @param type type The datatype of the window boundary + * @param type The datatype of the window boundary * @return An unbounded window boundary object */ static range_window_bounds unbounded(data_type type); diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dab085ad7d5..af5e6d6b2d6 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -457,6 +457,8 @@ class string_scalar : public scalar { /** * @brief Construct a new string scalar object. * + * @throws std::overflow_error If the size of the input string exceeds cudf::size_type + * * @param string The value of the string. * @param is_valid Whether the value held by the scalar is valid. * @param stream CUDA stream used for device memory operations. @@ -545,7 +547,7 @@ class string_scalar : public scalar { * @brief Returns a raw pointer to the string in device memory. * @return a raw pointer to the string in device memory */ - [[nodiscard]] const char* data() const; + [[nodiscard]] char const* data() const; protected: rmm::device_buffer _data{}; ///< device memory containing the string diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 18bcd89a00b..846da0bbe10 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -304,7 +304,7 @@ class string_scalar_device_view : public detail::scalar_device_view_base { * validity of the stored value * @param size The pointer to the size of the string in device memory */ - string_scalar_device_view(data_type type, const char* data, bool* is_valid, size_type size) + string_scalar_device_view(data_type type, char const* data, bool* is_valid, size_type size) : detail::scalar_device_view_base(type, is_valid), _data(data), _size(size) { } @@ -337,7 +337,7 @@ class string_scalar_device_view : public detail::scalar_device_view_base { [[nodiscard]] __device__ size_type size() const noexcept { return _size; } private: - const char* _data{}; ///< Pointer to device memory containing the value + char const* _data{}; ///< Pointer to device memory containing the value size_type _size; ///< Size of the string in bytes }; diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index fee22786d7a..49acce6a63b 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -63,6 +63,7 @@ namespace cudf { * @param needles Values for which to find the insert locations in the search space * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the insertion points */ @@ -71,6 +72,7 @@ std::unique_ptr lower_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -103,6 +105,7 @@ std::unique_ptr lower_bound( * @param needles Values for which to find the insert locations in the 
search space * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A non-nullable column of elements containing the insertion points */ @@ -111,6 +114,7 @@ std::unique_ptr upper_bound( table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,9 +132,12 @@ std::unique_ptr upper_bound( * * @param haystack The column containing search space * @param needle A scalar value to check for existence in the search space + * @param stream CUDA stream used for device memory operations and kernel launches * @return true if the given `needle` value exists in the `haystack` column */ -bool contains(column_view const& haystack, scalar const& needle); +bool contains(column_view const& haystack, + scalar const& needle, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Check if the given `needles` values exists in the `haystack` column. @@ -149,12 +156,14 @@ bool contains(column_view const& haystack, scalar const& needle); * * @param haystack The column containing search space * @param needles A column of values to check for existence in the search space + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return A BOOL column indicating if each element in `needles` exists in the search space */ std::unique_ptr contains( column_view const& haystack, column_view const& needles, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index e2a6b97256f..984e3037cd1 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -254,22 +254,19 @@ std::unique_ptr
unique( * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. If there are duplicate rows, which row to be copied depends on the specified value of - * the `keep` parameter. + * rows. If there are duplicate rows, which row is copied depends on the `keep` parameter. * * The order of rows in the output table is not specified. * * Performance hint: if the input is pre-sorted, `cudf::unique` can produce an equivalent result * (i.e., same set of output rows) but with less running time than `cudf::distinct`. * - * @param[in] input input table_view to copy only distinct rows - * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep any, first, last, or none of the found duplicates - * @param[in] nulls_equal flag to control if nulls are compared equal or not - * @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not - * @param[in] mr Device memory resource used to allocate the returned table's device - * memory - * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows in an unspecified order */ std::unique_ptr
<table> distinct( @@ -280,6 +277,36 @@ std::unique_ptr<table>
distinct( nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new table without duplicate rows, preserving input order. + * + * Given an `input` table_view, each row is copied to the output table to create a set of distinct + * rows. The input row order is preserved. If there are duplicate rows, which row is copied depends + * on the `keep` parameter. + * + * This API produces the same output rows as `cudf::distinct`, but with input order preserved. + * + * Note that when `keep` is `KEEP_ANY`, the choice of which duplicate row to keep is arbitrary, but + * the returned table will retain the input order. That is, if the key column contained `1, 2, 1` + * with another values column `3, 4, 5`, the result could contain values `3, 4` or `4, 5` but not + * `4, 3` or `5, 4`. + * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table + * @return Table with distinct rows, preserving input order + */ +std::unique_ptr
stable_distinct( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Count the number of consecutive groups of equivalent rows in a column. * diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 067f646fc33..71f65ac9080 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -258,8 +258,8 @@ std::unique_ptr concatenate( * @return New strings column with concatenated results. */ std::unique_ptr join_list_elements( - const lists_column_view& lists_strings_column, - const strings_column_view& separators, + lists_column_view const& lists_strings_column, + strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, @@ -316,7 +316,7 @@ std::unique_ptr join_list_elements( * @return New strings column with concatenated results. */ std::unique_ptr join_list_elements( - const lists_column_view& lists_strings_column, + lists_column_view const& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index a7e09e09bac..fa729d26734 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -243,7 +243,7 @@ std::unique_ptr from_timestamps( column_view const& timestamps, std::string_view format = "%Y-%m-%dT%H:%M:%SZ", strings_column_view const& names = strings_column_view(column_view{ - data_type{type_id::STRING}, 0, nullptr}), + data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 3208d5f8f3b..44213b84139 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,7 +81,7 @@ std::unique_ptr from_integers( * * The output row entry will be set to `true` if the corresponding string element * have all characters in [-+0-9]. The optional sign character must only be in the first - * position. Notice that the the integer value is not checked to be within its storage limits. + * position. Notice that the integer value is not checked to be within its storage limits. * For strict integer type check, use the other `is_integer()` API which accepts `data_type` * argument. 
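To make the loose-versus-strict distinction described above concrete, here is a small sketch assuming the two `is_integer` overloads declared in this header; the column contents in the comments are hypothetical.

```cpp
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

// Suppose `strs` holds ["123", "+456", "99999999999", "1.5"].
void check_integer_strings(cudf::strings_column_view const& strs)
{
  // Character-set check only: "99999999999" reports true even though it
  // cannot be stored in an int32.
  auto loose = cudf::strings::is_integer(strs);

  // Strict check: additionally verifies each value fits the requested type,
  // so the overflowing row reports false here.
  auto strict = cudf::strings::is_integer(strs, cudf::data_type{cudf::type_id::INT32});

  (void)loose;
  (void)strict;
}
```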
* diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index e2480b459b9..7ab1bf47b0a 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ std::unique_ptr format_list_column( lists_column_view const& input, string_scalar const& na_rep = string_scalar("NULL"), strings_column_view const& separators = strings_column_view(column_view{ - data_type{type_id::STRING}, 0, nullptr}), + data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 275b7223a3b..0901076c835 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ using character_flags_table_type = std::uint8_t; * * @return Device memory pointer to character flags table. */ -const character_flags_table_type* get_character_flags_table(); +character_flags_table_type const* get_character_flags_table(); // utilities to dissect a character-table flag constexpr uint8_t IS_DECIMAL(uint8_t x) { return ((x) & (1 << 0)); } @@ -61,7 +61,7 @@ using character_cases_table_type = uint16_t; * * @return Device memory pointer to character cases table. */ -const character_cases_table_type* get_character_cases_table(); +character_cases_table_type const* get_character_cases_table(); /** * @brief Case mapping structure for special characters. diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 185754a00c8..dd55cae4537 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ namespace detail { */ template __device__ inline thrust::pair parse_integer( - char const*& iter, char const* iter_end, const char decimal_pt_char = '.') + char const*& iter, char const* iter_end, char const decimal_pt_char = '.') { // highest value where another decimal digit cannot be appended without an overflow; // this preserves the most digits when scaling the final result for this type diff --git a/cpp/include/cudf/strings/detail/convert/is_float.cuh b/cpp/include/cudf/strings/detail/convert/is_float.cuh index 92c993cfbb5..5b09da96dc4 100644 --- a/cpp/include/cudf/strings/detail/convert/is_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/is_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,7 +83,7 @@ inline __device__ bool is_float(string_view const& d_str) bool decimal_found = false; bool exponent_found = false; size_type bytes = d_str.size_bytes(); - const char* data = d_str.data(); + char const* data = d_str.data(); // sign character allowed at the beginning of the string size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0; diff --git a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh index 8721f21a7c0..ab934750f9e 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ namespace detail { */ __device__ inline double stod(string_view const& d_str) { - const char* in_ptr = d_str.data(); - const char* end = in_ptr + d_str.size_bytes(); + char const* in_ptr = d_str.data(); + char const* end = in_ptr + d_str.size_bytes(); if (end == in_ptr) return 0.0; double sign{1.0}; if (*in_ptr == '-' || *in_ptr == '+') { diff --git a/cpp/include/cudf/strings/detail/convert/string_to_int.cuh b/cpp/include/cudf/strings/detail/convert/string_to_int.cuh index 6c8de06602e..8bbaea9390c 100644 --- a/cpp/include/cudf/strings/detail/convert/string_to_int.cuh +++ b/cpp/include/cudf/strings/detail/convert/string_to_int.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ __device__ inline int64_t string_to_integer(string_view const& d_str) int64_t value = 0; size_type bytes = d_str.size_bytes(); if (bytes == 0) return value; - const char* ptr = d_str.data(); + char const* ptr = d_str.data(); int sign = 1; if (*ptr == '-' || *ptr == '+') { sign = (*ptr == '-' ? -1 : 1); diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28b98eac3b5..7cd2338cb67 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,16 +33,14 @@ #include #include #include -#include -#include -#include +#include namespace cudf { namespace strings { namespace detail { // Helper function for loading 16B from a potentially unaligned memory location to registers. -__forceinline__ __device__ uint4 load_uint4(const char* ptr) +__forceinline__ __device__ uint4 load_uint4(char const* ptr) { auto const offset = reinterpret_cast(ptr) % 4; auto const* aligned_ptr = reinterpret_cast(ptr - offset); @@ -100,7 +99,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, // This check is necessary because string_indices[istring] may be out of bound. 
if (out_start == out_end) continue; - const char* in_start = strings_begin[string_indices[istring]].data(); + char const* in_start = strings_begin[string_indices[istring]].data(); // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`. // `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`. @@ -294,58 +293,31 @@ std::unique_ptr gather(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const output_count = std::distance(begin, end); - auto const strings_count = strings.size(); + auto const output_count = std::distance(begin, end); if (output_count == 0) return make_empty_column(type_id::STRING); - // allocate offsets column and use memory to compute string size in each output row - auto out_offsets_column = make_numeric_column( - data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_out_offsets = out_offsets_column->mutable_view().template data(); - auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; - auto const d_strings = column_device_view::create(strings.parent(), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - begin, - end, - d_out_offsets, - [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) { - if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; - if (not d_strings.is_valid(in_idx)) return 0; - return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; - }); + // build offsets column + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto const d_in_offsets = !strings.is_empty() ? strings.offsets_begin() : nullptr; - // check total size is not too large - size_t const total_bytes = thrust::transform_reduce( - rmm::exec_policy_nosync(stream), - d_out_offsets, - d_out_offsets + output_count, - [] __device__(auto size) { return static_cast(size); }, - size_t{0}, - thrust::plus{}); - CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), - "total size of output strings is too large for a cudf column"); - - // In-place convert output sizes into offsets - thrust::exclusive_scan(rmm::exec_policy_nosync(stream), - d_out_offsets, - d_out_offsets + output_count + 1, - d_out_offsets); + auto offsets_itr = thrust::make_transform_iterator( + begin, [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) { + if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; } + if (not d_strings.is_valid(idx)) { return 0; } + return d_in_offsets[idx + 1] - d_in_offsets[idx]; + }); + auto [out_offsets_column, total_bytes] = + cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr); // build chars column - cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto out_chars_column = gather_chars(d_strings->begin(), - begin, - end, - d_out_offsets_span, - static_cast(total_bytes), - stream, - mr); + auto const offsets_view = out_offsets_column->view(); + auto out_chars_column = gather_chars( + d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); return make_strings_column(output_count, std::move(out_offsets_column), std::move(out_chars_column), - 0, + 0, // caller sets these rmm::device_buffer{}); } diff --git a/cpp/include/cudf/strings/detail/split_utils.cuh b/cpp/include/cudf/strings/detail/split_utils.cuh index 99a5edaf91a..a95a9ee23bd 100644 --- a/cpp/include/cudf/strings/detail/split_utils.cuh 
+++ b/cpp/include/cudf/strings/detail/split_utils.cuh @@ -22,6 +22,35 @@ namespace cudf { namespace strings { namespace detail { +constexpr bool is_whitespace(char_utf8 ch) { return ch <= ' '; } + +/** + * @brief Count tokens delimited by whitespace + * + * @param d_str String to tokenize + * @param max_tokens Maximum number of tokens to count + * @return Number of tokens delimited by whitespace + */ +__device__ inline size_type count_tokens_whitespace( + string_view d_str, size_type const max_tokens = std::numeric_limits::max()) +{ + auto token_count = size_type{0}; + auto spaces = true; + auto itr = d_str.data(); + auto const end = itr + d_str.size_bytes(); + while (itr < end && token_count < max_tokens) { + cudf::char_utf8 ch = 0; + auto const chr_width = cudf::strings::detail::to_char_utf8(itr, ch); + if (spaces == is_whitespace(ch)) { + itr += chr_width; + } else { + token_count += static_cast(spaces); + spaces = !spaces; + } + } + return token_count; +} + // JIT has trouble including thrust/pair.h struct position_pair { size_type first; @@ -43,26 +72,33 @@ struct whitespace_string_tokenizer { */ __device__ bool next_token() { - if (itr != d_str.begin()) { // skip these 2 lines the first time through - ++itr; - start_position = itr.byte_offset(); // end_position + 1; + if (start_position >= d_str.size_bytes()) { return false; } + auto const src_ptr = d_str.data(); + if (current_position != 0) { + current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]); + start_position = current_position; } - if (start_position >= d_str.size_bytes()) return false; + if (start_position >= d_str.size_bytes()) { return false; } // continue search for the next token end_position = d_str.size_bytes(); - for (; itr < d_str.end(); ++itr) { - if (spaces == (*itr <= ' ')) { - if (spaces) - start_position = (itr + 1).byte_offset(); - else - end_position = (itr + 1).byte_offset(); + while (current_position < d_str.size_bytes()) { + cudf::char_utf8 ch = 0; + auto const chr_width = cudf::strings::detail::to_char_utf8(src_ptr + current_position, ch); + if (spaces == is_whitespace(ch)) { + current_position += chr_width; + if (spaces) { + start_position = current_position; + } else { + end_position = current_position; + } continue; } spaces = !spaces; if (spaces) { - end_position = itr.byte_offset(); + end_position = current_position; break; } + current_position += chr_width; } return start_position < end_position; } @@ -106,7 +142,8 @@ struct whitespace_string_tokenizer { spaces(true), start_position{reverse ? d_str.size_bytes() + 1 : 0}, end_position{d_str.size_bytes()}, - itr{reverse ? d_str.end() : d_str.begin()} + itr{reverse ? 
d_str.end() : d_str.begin()}, + current_position{0} { } @@ -116,6 +153,7 @@ struct whitespace_string_tokenizer { cudf::string_view::const_iterator itr; size_type start_position; size_type end_position; + size_type current_position; }; } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 02a65c01178..5f8a2a34606 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -79,8 +79,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Convert the sizes to offsets auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), - "Size of output exceeds column size limit", + CUDF_EXPECTS(bytes <= std::numeric_limits::max(), + "Size of output exceeds the column size limit", std::overflow_error); // Now build the chars column diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index b219b28cf9b..7e608cd10f0 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -45,7 +45,7 @@ namespace detail { * @brief Basic type expected for iterators passed to `make_strings_column` that represent string * data in device memory. */ -using string_index_pair = thrust::pair; +using string_index_pair = thrust::pair; /** * @brief Average string byte-length threshold for deciding character-level diff --git a/cpp/include/cudf/strings/detail/strip.cuh b/cpp/include/cudf/strings/detail/strip.cuh index 533e76121b5..264ea0c103a 100644 --- a/cpp/include/cudf/strings/detail/strip.cuh +++ b/cpp/include/cudf/strings/detail/strip.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,8 @@ __device__ cudf::string_view strip(cudf::string_view const d_str, cudf::string_view const d_to_strip, side_type side = side_type::BOTH) { + if (d_str.empty()) { return cudf::string_view{}; } // sanitize empty return + auto is_strip_character = [d_to_strip](char_utf8 chr) -> bool { if (d_to_strip.empty()) return chr <= ' '; // whitespace check for (auto c : d_to_strip) { diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 9e1bd10c1cf..df8e2885782 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -108,7 +108,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(const char* str, char_utf8& character) +constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 1d48a5cc201..5c719cd25d2 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,9 @@ #include #include +#include +#include + #include #include @@ -29,14 +32,15 @@ namespace detail { * @brief Copies input string data into a buffer and increments the pointer by the number of bytes * copied. * - * @param buffer Device buffer to copy to. - * @param input Data to copy from. - * @param bytes Number of bytes to copy. - * @return Pointer to the end of the output buffer after the copy. + * @param buffer Device buffer to copy to + * @param input Data to copy from + * @param bytes Number of bytes to copy + * @return Pointer to the end of the output buffer after the copy */ -__device__ inline char* copy_and_increment(char* buffer, const char* input, size_type bytes) +__device__ inline char* copy_and_increment(char* buffer, char const* input, size_type bytes) { - memcpy(buffer, input, bytes); + // this can be slightly faster than memcpy + thrust::copy_n(thrust::seq, input, bytes, buffer); return buffer + bytes; } @@ -48,7 +52,7 @@ __device__ inline char* copy_and_increment(char* buffer, const char* input, size * @param d_string String to copy. * @return Pointer to the end of the output buffer after the copy. */ -__device__ inline char* copy_string(char* buffer, const string_view& d_string) +__device__ inline char* copy_string(char* buffer, string_view const& d_string) { return copy_and_increment(buffer, d_string.data(), d_string.size_bytes()); } @@ -62,7 +66,7 @@ class per_context_cache { // If there is no object available in the cache, it calls the initializer // `init` to create a new one and cache it for later uses. template - TableType* find_or_initialize(const Initializer& init) + TableType* find_or_initialize(Initializer const& init) { int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); @@ -85,7 +89,7 @@ template class thread_safe_per_context_cache : public per_context_cache { public: template - TableType* find_or_initialize(const Initializer& init) + TableType* find_or_initialize(Initializer const& init) { std::lock_guard guard(mutex); return per_context_cache::find_or_initialize(init); diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index 4f4b71ac82d..2fed36862b9 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,6 +87,33 @@ std::unique_ptr rfind( size_type stop = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a column of character position values where the target + * string is first found in the corresponding string of the provided column + * + * The output of row `i` is the character position of the target string for row `i` + * within input string of row `i` starting at the character position `start`. + * If the target is not found within the input string, -1 is returned for that + * row entry in the output column. + * + * Any null input or target entries return corresponding null output column entries. 
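For the column-target overload of `find` declared just below, a minimal sketch of the per-row position semantics described here; the row contents in the comments are hypothetical.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/strings/find.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Suppose input  = ["hello world", "goodbye", "abc"]
//         target = ["world",       "xyz",     "c"  ]
// The documented behavior yields positions [6, -1, 2].
std::unique_ptr<cudf::column> find_targets(cudf::strings_column_view const& input,
                                           cudf::strings_column_view const& target)
{
  return cudf::strings::find(input, target, /*start=*/0);
}
```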
+ * + * @throw cudf::logic_error if `input.size() != target.size()` + * + * @param input Strings to search against + * @param target Strings to search for in `input` + * @param start First character position to include in the search + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New integer column with character position values + */ +std::unique_ptr find( + strings_column_view const& input, + strings_column_view const& target, + size_type start = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a column of boolean values for each string where true indicates * the target string was found within that string in the provided column. diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 26fe5f95983..2b6575f80d0 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -20,8 +20,6 @@ #include -#include - namespace cudf { namespace strings { /** @@ -49,9 +47,8 @@ namespace strings { * out is '123XYZ-123XYZ-123XYZ-' * @endcode * - * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that - * can be stored by the index type: - * `input.size() * repeat_times > max of size_type` + * @throw std::overflow_error if the size of the output string scalar exceeds the maximum value that + * can be stored by the scalar: `input.size() * repeat_times > max of size_type` * * @param input The scalar containing the string to repeat * @param repeat_times The number of times the input string is repeated diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index e28d42b8154..5f2c71725eb 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -107,99 +107,6 @@ std::unique_ptr slice_strings( column_view const& stops, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Slices a column of strings by using a delimiter as a slice point. - * - * Returns a column of strings after searching for @p delimiter @p count number of - * times in the source @p strings from left to right if @p count is positive or from - * right to left if @p count is negative. If @p count is positive, it returns a substring - * from the start of the source @p strings up until @p count occurrence of the @p delimiter - * not including the @p delimiter. If @p count is negative, it returns a substring from - * the start of the @p count occurrence of the @p delimiter in the source @p strings past - * the delimiter until the end of the string. - * - * The search for @p delimiter in @p strings is case sensitive. - * If the row value of @p strings is null, the row value in the output column will be null. - * If the @p count is 0 or if @p delimiter is invalid or empty, every row in the output column - * will be an empty string. - * If the column value for a row is empty, the row value in the output column will be empty. - * If @p count occurrences of @p delimiter isn't found, the row value in the output column will - * be the row value from the input @p strings column. 
- * - * @code{.pseudo} - * Example: - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] - * r = slice_strings(in_s, '.', 1) - * r = ['www', null, 'www', '', 'foo'] - * - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo'] - * r = slice_strings(in_s, '.', -2) - * r = ['nvidia.com', null, 'google.com', '', 'foo'] - * @endcode - * - * @param strings Strings instance for this operation. - * @param delimiter UTF-8 encoded string to search for in each string. - * @param count Number of times to search for delimiter in each string. If the value is positive, - * delimiter is searched from left to right; else, it is searched from right to left. - * @param mr Resource for allocating device memory. - * @return New strings column containing the substrings. - */ -std::unique_ptr slice_strings( - strings_column_view const& strings, - string_scalar const& delimiter, - size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Slices a column of strings by using a delimiter column as slice points. - * - * Returns a column of strings after searching the delimiter defined per row from - * @p delimiter_strings @p count number of times in the source @p strings from left to right - * if @p count is positive or from right to left if @p count is negative. If @p count is - * positive, it returns a substring from the start of the source @p strings up until - * @p count occurrence of the delimiter for that row not including that delimiter. If @p count - * is negative, it returns a substring from the start of the @p count occurrence of the - * delimiter for that row in the source @p strings past the delimiter until the end of the string. - * - * The search for @p delimiter_strings in @p strings is case sensitive. - * If the @p count is 0, every row in the output column will be an empty string. - * If the row value of @p strings is null, the row value in the output column will be null. - * If the row value from @p delimiter_strings is invalid or null, the row value in the - * output column will be an empty string. - * If the row value from @p delimiter_strings or the column value for a row is empty, the - * row value in the output column will be empty. - * If @p count occurrences of delimiter isn't found, the row value in the output column will - * be the row value from the input @p strings column. - * - * @code{.pseudo} - * Example: - * in_s = ['www.nvidia.com', null, 'www.google.com', 'bar', 'foo..bar....goo'] - * delimiters = ['.', '..', '', null, '..'] - * r = slice_strings(in_s, delimiters, 2) - * r = ['www.nvidia', null, '', '', 'foo..bar'] - * - * in_s = ['www.nvidia.com', null, 'www.google.com', '', 'foo..bar....goo', 'apache.org'] - * delimiters = ['.', '..', '', null,'..', '.'] - * r = slice_strings(in_s, delimiters, -2) - * r = ['nvidia.com', null, '', '', '..goo', 'apache.org'] - * @endcode - * - * @throw cudf::logic_error if the number of rows in @p strings and @p delimiter_strings do not - * match. - * - * @param strings Strings instance for this operation. - * @param delimiter_strings UTF-8 encoded string for each row. - * @param count Number of times to search for delimiter in each string. If the value is positive, - * delimiter is searched from left to right; else, it is searched from right to left. - * @param mr Resource for allocating device memory. - * @return New strings column containing the substrings. 
- */ -std::unique_ptr slice_strings( - strings_column_view const& strings, - strings_column_view const& delimiter_strings, - size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index a6c942d39b4..701950e61a5 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,20 @@ namespace strings { * * Any null string entries return corresponding null output columns. * - * @param strings_column Strings instance for this operation. - * @param delimiter UTF-8 encoded string indicating the split points in each string. + * @param strings_column Strings instance for this operation + * @param delimiter UTF-8 encoded string indicating the split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. + * @param maxsplit Maximum number of splits to perform; * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr
split( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,18 +73,20 @@ std::unique_ptr
split( * * Any null string entries return corresponding null output columns. * - * @param strings_column Strings instance for this operation. - * @param delimiter UTF-8 encoded string indicating the split points in each string. + * @param strings_column Strings instance for this operation + * @param delimiter UTF-8 encoded string indicating the split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. + * @param maxsplit Maximum number of splits to perform; * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned table's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * @return New strings columns. */ std::unique_ptr
rsplit( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,20 +145,22 @@ std::unique_ptr
rsplit( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be split. - * @param delimiter The string to identify split points in each string. + * @param strings A column of string elements to be split + * @param delimiter The string to identify split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. - * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row + * @param maxsplit Maximum number of splits to perform; + * Default of -1 indicates all possible splits on each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings; + * Each row of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,20 +224,22 @@ std::unique_ptr split_record( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be split. - * @param delimiter The string to identify split points in each string. + * @param strings A column of string elements to be split + * @param delimiter The string to identify split points in each string; * Default of empty string indicates split on whitespace. - * @param maxsplit Maximum number of splits to perform. - * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row + * @param maxsplit Maximum number of splits to perform; + * Default of -1 indicates all possible splits on each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings; + * Each row of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr rsplit_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index fc4e3d57cfb..74df1ea1887 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -46,7 +46,7 @@ namespace detail { * @param bytes Number of bytes in str. * @return The number of characters in the array. 
*/ -__device__ inline size_type characters_in_string(const char* str, size_type bytes) +__device__ inline size_type characters_in_string(char const* str, size_type bytes) { if ((str == nullptr) || (bytes == 0)) return 0; auto ptr = reinterpret_cast(str); @@ -123,7 +123,7 @@ CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); } */ CUDF_HOST_DEVICE inline string_view string_view::max() { - const char* psentinel{nullptr}; + char const* psentinel{nullptr}; #if defined(__CUDA_ARCH__) psentinel = &cudf::strings::detail::max_string_sentinel[0]; #else @@ -142,7 +142,7 @@ __device__ inline size_type string_view::length() const // @cond // this custom iterator knows about UTF8 encoding -__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos) +__device__ inline string_view::const_iterator::const_iterator(string_view const& str, size_type pos) : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)} { } @@ -223,38 +223,45 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera return tmp; } +__device__ inline string_view::const_iterator& string_view::const_iterator::move_to( + size_type new_pos) +{ + *this += (new_pos - char_pos); // more efficient than recounting from the start + return *this; +} + __device__ inline bool string_view::const_iterator::operator==( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos == rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator!=( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p != rhs.p) || (char_pos != rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator<( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos < rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator<=( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos <= rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator>( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos > rhs.char_pos); } __device__ inline bool string_view::const_iterator::operator>=( - const string_view::const_iterator& rhs) const + string_view::const_iterator const& rhs) const { return (p == rhs.p) && (char_pos >= rhs.char_pos); } @@ -272,7 +279,7 @@ __device__ inline size_type string_view::const_iterator::byte_offset() const { r __device__ inline string_view::const_iterator string_view::begin() const { - return const_iterator(*this, 0); + return const_iterator(*this, 0, 0); } __device__ inline string_view::const_iterator string_view::end() const @@ -296,16 +303,16 @@ __device__ inline size_type string_view::byte_offset(size_type pos) const return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } -__device__ inline int string_view::compare(const string_view& in) const +__device__ inline int string_view::compare(string_view const& in) const { return compare(in.data(), in.size_bytes()); } -__device__ inline int string_view::compare(const char* data, size_type bytes) const +__device__ inline int string_view::compare(char const* data, size_type bytes) const { size_type const len1 = size_bytes(); - const auto* ptr1 = 
reinterpret_cast(this->data()); - const auto* ptr2 = reinterpret_cast(data); + auto const* ptr1 = reinterpret_cast(this->data()); + auto const* ptr2 = reinterpret_cast(data); if ((ptr1 == ptr2) && (bytes == len1)) return 0; size_type idx = 0; for (; (idx < len1) && (idx < bytes); ++idx) { @@ -318,39 +325,39 @@ __device__ inline int string_view::compare(const char* data, size_type bytes) co return 0; } -__device__ inline bool string_view::operator==(const string_view& rhs) const +__device__ inline bool string_view::operator==(string_view const& rhs) const { return (size_bytes() == rhs.size_bytes()) && (compare(rhs) == 0); } -__device__ inline bool string_view::operator!=(const string_view& rhs) const +__device__ inline bool string_view::operator!=(string_view const& rhs) const { return compare(rhs) != 0; } -__device__ inline bool string_view::operator<(const string_view& rhs) const +__device__ inline bool string_view::operator<(string_view const& rhs) const { return compare(rhs) < 0; } -__device__ inline bool string_view::operator>(const string_view& rhs) const +__device__ inline bool string_view::operator>(string_view const& rhs) const { return compare(rhs) > 0; } -__device__ inline bool string_view::operator<=(const string_view& rhs) const +__device__ inline bool string_view::operator<=(string_view const& rhs) const { int rc = compare(rhs); return (rc == 0) || (rc < 0); } -__device__ inline bool string_view::operator>=(const string_view& rhs) const +__device__ inline bool string_view::operator>=(string_view const& rhs) const { int rc = compare(rhs); return (rc == 0) || (rc > 0); } -__device__ inline size_type string_view::find(const string_view& str, +__device__ inline size_type string_view::find(string_view const& str, size_type pos, size_type count) const { @@ -358,7 +365,7 @@ __device__ inline size_type string_view::find(const string_view& str, } template -__device__ inline size_type string_view::find_impl(const char* str, +__device__ inline size_type string_view::find_impl(char const* str, size_type bytes, size_type pos, size_type count) const @@ -388,7 +395,7 @@ __device__ inline size_type string_view::find_impl(const char* str, return npos; } -__device__ inline size_type string_view::find(const char* str, +__device__ inline size_type string_view::find(char const* str, size_type bytes, size_type pos, size_type count) const @@ -403,14 +410,14 @@ __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size return find(str, chwidth, pos, count); } -__device__ inline size_type string_view::rfind(const string_view& str, +__device__ inline size_type string_view::rfind(string_view const& str, size_type pos, size_type count) const { return rfind(str.data(), str.size_bytes(), pos, count); } -__device__ inline size_type string_view::rfind(const char* str, +__device__ inline size_type string_view::rfind(char const* str, size_type bytes, size_type pos, size_type count) const diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 23627943d95..afc7e027a4b 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -60,7 +60,7 @@ class string_view { * * @return A pointer to the internal device array */ - CUDF_HOST_DEVICE [[nodiscard]] inline const char* data() const { return _data; } + CUDF_HOST_DEVICE [[nodiscard]] inline char const* data() const { return _data; } /** * @brief Return true if string has no characters @@ -80,10 +80,10 @@ class string_view { using reference = char_utf8&; 
using pointer = char_utf8*; using iterator_category = std::input_iterator_tag; - __device__ inline const_iterator(const string_view& str, size_type pos); - const_iterator(const const_iterator& mit) = default; + __device__ inline const_iterator(string_view const& str, size_type pos); + const_iterator(const_iterator const& mit) = default; const_iterator(const_iterator&& mit) = default; - const_iterator& operator=(const const_iterator&) = default; + const_iterator& operator=(const_iterator const&) = default; const_iterator& operator=(const_iterator&&) = default; __device__ inline const_iterator& operator++(); __device__ inline const_iterator operator++(int); @@ -93,19 +93,20 @@ class string_view { __device__ inline const_iterator operator--(int); __device__ inline const_iterator& operator-=(difference_type); __device__ inline const_iterator operator-(difference_type) const; - __device__ inline bool operator==(const const_iterator&) const; - __device__ inline bool operator!=(const const_iterator&) const; - __device__ inline bool operator<(const const_iterator&) const; - __device__ inline bool operator<=(const const_iterator&) const; - __device__ inline bool operator>(const const_iterator&) const; - __device__ inline bool operator>=(const const_iterator&) const; + __device__ inline const_iterator& move_to(size_type); + __device__ inline bool operator==(const_iterator const&) const; + __device__ inline bool operator!=(const_iterator const&) const; + __device__ inline bool operator<(const_iterator const&) const; + __device__ inline bool operator<=(const_iterator const&) const; + __device__ inline bool operator>(const_iterator const&) const; + __device__ inline bool operator>=(const_iterator const&) const; __device__ inline char_utf8 operator*() const; [[nodiscard]] __device__ inline size_type position() const; [[nodiscard]] __device__ inline size_type byte_offset() const; private: friend class string_view; - const char* p{}; + char const* p{}; size_type bytes{}; size_type char_pos{}; size_type byte_pos{}; @@ -154,7 +155,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ [[nodiscard]] inline int compare(const string_view& str) const; + __device__ [[nodiscard]] inline int compare(string_view const& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -169,7 +170,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ inline int compare(const char* str, size_type bytes) const; + __device__ inline int compare(char const* str, size_type bytes) const; /** * @brief Returns true if rhs matches this string exactly. @@ -177,42 +178,42 @@ class string_view { * @param rhs Target string to compare with this string. * @return true if rhs matches this string exactly */ - __device__ inline bool operator==(const string_view& rhs) const; + __device__ inline bool operator==(string_view const& rhs) const; /** * @brief Returns true if rhs does not match this string. * * @param rhs Target string to compare with this string. * @return true if rhs does not match this string */ - __device__ inline bool operator!=(const string_view& rhs) const; + __device__ inline bool operator!=(string_view const& rhs) const; /** * @brief Returns true if this string is ordered before rhs. * * @param rhs Target string to compare with this string. 
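A short device-side sketch of the const_iterator::move_to() method added above: it repositions an existing iterator to an absolute character position, reusing the characters already counted rather than walking the UTF-8 data from the start. The helper `code_point_at` is hypothetical and assumes 0 <= pos < d_str.length().

#include <cudf/strings/string_view.cuh>

// Sketch: decode the UTF-8 code point at character position `pos` without
// re-walking the string from its first byte each time.
__device__ cudf::char_utf8 code_point_at(cudf::string_view d_str, cudf::size_type pos)
{
  auto itr = d_str.begin();  // iterator at character position 0
  itr.move_to(pos);          // repositions relative to where the iterator already is
  return *itr;               // dereference yields the code point at `pos`
}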
* @return true if this string is ordered before rhs */ - __device__ inline bool operator<(const string_view& rhs) const; + __device__ inline bool operator<(string_view const& rhs) const; /** * @brief Returns true if rhs is ordered before this string. * * @param rhs Target string to compare with this string. * @return true if rhs is ordered before this string */ - __device__ inline bool operator>(const string_view& rhs) const; + __device__ inline bool operator>(string_view const& rhs) const; /** * @brief Returns true if this string matches or is ordered before rhs. * * @param rhs Target string to compare with this string. * @return true if this string matches or is ordered before rhs */ - __device__ inline bool operator<=(const string_view& rhs) const; + __device__ inline bool operator<=(string_view const& rhs) const; /** * @brief Returns true if rhs matches or is ordered before this string. * * @param rhs Target string to compare with this string. * @return true if rhs matches or is ordered before this string */ - __device__ inline bool operator>=(const string_view& rhs) const; + __device__ inline bool operator>=(string_view const& rhs) const; /** * @brief Returns the character position of the first occurrence where the @@ -224,7 +225,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if str is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(const string_view& str, + __device__ [[nodiscard]] inline size_type find(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -238,7 +239,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ inline size_type find(const char* str, + __device__ inline size_type find(char const* str, size_type bytes, size_type pos = 0, size_type count = -1) const; @@ -265,7 +266,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(const string_view& str, + __device__ [[nodiscard]] inline size_type rfind(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -279,7 +280,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ inline size_type rfind(const char* str, + __device__ inline size_type rfind(char const* str, size_type bytes, size_type pos = 0, size_type count = -1) const; @@ -339,12 +340,12 @@ class string_view { * @param data Device char array encoded in UTF8. * @param bytes Number of bytes in data array. 
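To make the character-position semantics of find() and its `pos` argument concrete, a small device-side sketch that combines two forward searches with string_view::substr(); substr() is not shown in this hunk and is assumed available, and the helper name is hypothetical.

#include <cudf/strings/string_view.cuh>

// Sketch: the text strictly between the first two occurrences of `delim`,
// or an empty string_view when there are fewer than two occurrences.
__device__ cudf::string_view between_first_two(cudf::string_view d_str,
                                               cudf::string_view delim)
{
  auto const first = d_str.find(delim);          // character position, -1 when absent
  if (first < 0) { return cudf::string_view{}; }
  auto const begin  = first + delim.length();
  auto const second = d_str.find(delim, begin);  // resume the search after the first hit
  if (second < 0) { return cudf::string_view{}; }
  return d_str.substr(begin, second - begin);    // positions and counts are in characters
}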
*/ - CUDF_HOST_DEVICE inline string_view(const char* data, size_type bytes) + CUDF_HOST_DEVICE inline string_view(char const* data, size_type bytes) : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH) { } - string_view(const string_view&) = default; ///< Copy constructor + string_view(string_view const&) = default; ///< Copy constructor string_view(string_view&&) = default; ///< Move constructor ~string_view() = default; /** @@ -352,7 +353,7 @@ class string_view { * * @return Reference to this instance */ - string_view& operator=(const string_view&) = default; + string_view& operator=(string_view const&) = default; /** * @brief Move assignment operator * @@ -368,7 +369,7 @@ class string_view { static inline cudf::size_type const npos{-1}; private: - const char* _data{}; ///< Pointer to device memory contain char array for this string + char const* _data{}; ///< Pointer to device memory contain char array for this string size_type _bytes{}; ///< Number of bytes in _data for this string mutable size_type _length{}; ///< Number of characters in this string (computed) @@ -399,7 +400,7 @@ class string_view { * @return npos if str is not found in this string */ template - __device__ inline size_type find_impl(const char* str, + __device__ inline size_type find_impl(char const* str, size_type bytes, size_type pos, size_type count) const; diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e617dbde024..f1aa8e49f00 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,8 +67,8 @@ class strings_column_view : private column_view { using column_view::offset; using column_view::size; - using offset_iterator = offset_type const*; ///< offsets iterator type - using chars_iterator = char const*; ///< character iterator type + using offset_iterator = size_type const*; ///< offsets iterator type + using chars_iterator = char const*; ///< character iterator type /** * @brief Returns the parent column. diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3e37bd53972..6b024d902a9 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -17,11 +17,11 @@ #pragma once #include -#include #include #include #include -#include +#include +#include #include #include #include @@ -542,8 +542,10 @@ class device_row_comparator { size_type const rhs_index) const noexcept { int last_null_depth = std::numeric_limits::max(); - size_type list_column_index{0}; + size_type list_column_index{-1}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { + if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; } + int const depth = _depth.has_value() ? (*_depth)[i] : 0; if (depth > last_null_depth) { continue; } @@ -556,15 +558,12 @@ class device_row_comparator { // TODO: At what point do we verify that the columns of lhs and rhs are // all of the same types? I assume that it's already happened before // here, otherwise the current code would be failing. 
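The comparator change above starts list_column_index at -1 and advances it at the top of the loop for every LIST column, because dremel data is stored only for LIST columns. A purely illustrative host-side sketch of that indexing scheme (not libcudf code):

#include <cstddef>
#include <vector>

// Sketch: a counter that starts at -1 and is advanced whenever a LIST column is
// visited names that column's slot in the dremel array; non-LIST columns get no slot.
std::vector<int> dremel_slot_per_column(std::vector<bool> const& is_list_column)
{
  std::vector<int> slots(is_list_column.size(), -1);
  int list_column_index = -1;
  for (std::size_t i = 0; i < is_list_column.size(); ++i) {
    if (is_list_column[i]) { ++list_column_index; }
    slots[i] = is_list_column[i] ? list_column_index : -1;
  }
  return slots;
}
// e.g. {false, true, false, true}  ->  {-1, 0, -1, 1}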
- auto [l_dremel_i, r_dremel_i] = [&]() { - if (_lhs.column(i).type().id() == type_id::LIST) { - auto idx = list_column_index++; - return std::make_tuple(optional_dremel_view(_l_dremel[idx]), - optional_dremel_view(_r_dremel[idx])); - } else { - return std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); - } - }(); + auto const [l_dremel_i, r_dremel_i] = + _lhs.column(i).type().id() == type_id::LIST + ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), + optional_dremel_view(_r_dremel[list_column_index])) + : std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + auto element_comp = element_comparator{_check_nulls, _lhs.column(i), _rhs.column(i), @@ -1813,7 +1812,7 @@ class device_row_hasher { // Hash each element and combine all the hash values together return detail::accumulate(it, it + _table.num_columns(), _seed, [](auto hash, auto h) { - return cudf::detail::hash_combine(hash, h); + return cudf::hashing::detail::hash_combine(hash, h); }); } @@ -1854,7 +1853,8 @@ class device_row_hasher { auto validity_it = detail::make_validity_iterator(curr_col); hash = detail::accumulate( validity_it, validity_it + curr_col.size(), hash, [](auto hash, auto is_valid) { - return cudf::detail::hash_combine(hash, is_valid ? NON_NULL_HASH : NULL_HASH); + return cudf::hashing::detail::hash_combine(hash, + is_valid ? NON_NULL_HASH : NULL_HASH); }); } if (curr_col.type().id() == type_id::STRUCT) { @@ -1866,13 +1866,13 @@ class device_row_hasher { auto list_sizes = make_list_size_iterator(list_col); hash = detail::accumulate( list_sizes, list_sizes + list_col.size(), hash, [](auto hash, auto size) { - return cudf::detail::hash_combine(hash, hash_fn{}(size)); + return cudf::hashing::detail::hash_combine(hash, hash_fn{}(size)); }); curr_col = list_col.get_sliced_child(); } } for (int i = 0; i < curr_col.size(); ++i) { - hash = cudf::detail::hash_combine( + hash = cudf::hashing::detail::hash_combine( hash, type_dispatcher(curr_col.type(), _element_hasher, curr_col, i)); } @@ -1941,7 +1941,7 @@ class row_hasher { * @param seed The seed to use for the hash function * @return A hash operator to use on the device */ - template