From cf3c818d40e484db75e27c51c587b489ac393a51 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 3 May 2024 17:30:20 -0400 Subject: [PATCH 01/22] Add `from_arrow_device_host` functions for cudf interop with nanoarrow --- cpp/include/cudf/interop.hpp | 44 ++ cpp/src/interop/from_arrow_device.cu | 394 ++++++++++ cpp/tests/CMakeLists.txt | 1 + .../interop/from_arrow_device_host_test.cpp | 737 ++++++++++++++++++ cpp/tests/interop/nanoarrow_utils.hpp | 182 +++++ cpp/tests/interop/to_arrow_device_test.cpp | 2 - 6 files changed, 1358 insertions(+), 2 deletions(-) create mode 100644 cpp/tests/interop/from_arrow_device_host_test.cpp diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index bb05a622f40..577fe10f773 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -348,6 +348,50 @@ std::unique_ptr from_arrow( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowDeviceArray input + * + * @throws cudf::logic_error if either schema or input are NULL + * + * @throws cudf::logic_error if the device_type is not `ARROW_DEVICE_CPU` + * + * @throws cudf::data_type_error if the input array is not a struct array, + * non-struct arrays should be passed to `from_arrow_device_host_column` instead. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr from_arrow_device_host( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::column` from given ArrowDeviceArray input + * + * @throws cudf::logic_error if either schema or input are NULL + * + * @throws cudf::logic_error if the device_type is not `ARROW_DEVICE_CPU` + * + * @throws cudf::data_type_error if input arrow data type is not supported in cudf. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr from_arrow_device_host_column( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray * diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index d4d31d1989b..70d41b4b688 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -16,14 +16,17 @@ #include "arrow_utilities.hpp" +#include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -39,6 +42,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -88,6 +93,309 @@ data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) namespace { +struct dispatch_copy_from_arrow_device { + rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; + + std::unique_ptr get_mask_buffer(ArrowArray const* array) + { + auto* bitmap = array->buffers[validity_buffer_idx]; + if (bitmap == nullptr) { return std::make_unique(0, stream, mr); } + + auto const bitmask_size = array->length + array->offset; + auto const allocation_size = + bitmask_allocation_size_bytes(static_cast(bitmask_size)); + auto mask = std::make_unique(allocation_size, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), + reinterpret_cast(bitmap), + allocation_size, + cudaMemcpyDefault, + stream.value())); + return mask; + } + + template () && + !std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) + { + CUDF_FAIL("Unsupported type in copy_from_arrow_device."); + } + + template () || std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) + { + using DeviceType = std::conditional_t, __int128_t, T>; + + size_type const num_rows = input->length; + size_type const offset = input->offset; + size_type const null_count = input->null_count; + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_column_view = col->mutable_view(); + CUDF_CUDA_TRY( + cudaMemcpyAsync(mutable_column_view.data(), + reinterpret_cast(data_buffer) + offset * sizeof(DeviceType), + sizeof(DeviceType) * num_rows, + cudaMemcpyDefault, + stream.value())); + + if (has_nulls) { + auto tmp_mask = get_mask_buffer(input); + + // if array is sliced, we have to copy the whole mask and then take copy + auto out_mask = + (offset == 0) + ? std::move(*tmp_mask) + : cudf::detail::copy_bitmask( + static_cast(tmp_mask->data()), offset, offset + num_rows, stream, mr); + + col->set_null_mask(std::move(out_mask), null_count); + } + + return col; + } +}; + +// forward declaration is needed because `type_dispatch` instantiates the +// dispatch_from_arrow_device struct causing a recursive situation for struct, +// dictionary and list_view types. +std::unique_ptr get_column_copy(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template <> +std::unique_ptr dispatch_copy_from_arrow_device::operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) +{ + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); + // mask-to-bools expects the mask to be bitmask_type aligned/padded + auto data = rmm::device_buffer(buffer_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), + reinterpret_cast(data_buffer), + buffer_length, + cudaMemcpyDefault, + stream.value())); + auto out_col = mask_to_bools(static_cast(data.data()), + input->offset, + input->offset + input->length, + stream, + mr); + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + if (has_nulls) { + auto out_mask = detail::copy_bitmask(static_cast(get_mask_buffer(input)->data()), + input->offset, + input->offset + input->length, + stream, + mr); + + out_col->set_null_mask(std::move(out_mask), input->null_count); + } + + return out_col; +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_device::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + if (input->length == 0) { return make_empty_column(type_id::STRING); } + + const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + ArrowArray offsets_array = { + .length = input->offset + input->length + 1, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = offset_buffers, + }; + + size_type const char_data_length = + reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + const void* char_buffers[2] = {nullptr, input->buffers[2]}; + ArrowArray char_array = { + .length = char_data_length, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = char_buffers, + }; + + nanoarrow::UniqueSchema offset_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); + + nanoarrow::UniqueSchema char_data_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8)); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); + auto offsets_column = + this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); + auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + + auto const num_rows = offsets_column->size() - 1; + auto out_col = make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + input->null_count, + std::move(*get_mask_buffer(input))); + + return input->offset == 0 + ? std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_device::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + ArrowSchemaView keys_schema_view; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr)); + + auto const keys_type = arrow_to_cudf_type(&keys_schema_view); + auto keys_column = + get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); + + auto const dict_indices_type = [&schema]() -> data_type { + // cudf dictionary requires an unsigned type for the indices, + // since it is invalid for an arrow dictionary to contain negative + // indices, we can safely use the unsigned equivalent without having + // to modify the buffers. + switch (schema->storage_type) { + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); + } + }(); + + auto indices_column = get_column_copy(schema, input, dict_indices_type, false, stream, mr); + // child columns shouldn't have masks and we need the mask in the main column + auto column_contents = indices_column->release(); + indices_column = std::make_unique(dict_indices_type, + static_cast(input->length), + std::move(*(column_contents.data)), + rmm::device_buffer{}, + 0); + + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + std::move(*(column_contents.null_mask)), + input->null_count); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_device::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + char buffer[1024]; + + std::vector> child_columns; + std::transform(input->children, + input->children + input->n_children, + schema->schema->children, + std::back_inserter(child_columns), + [this, input, &buffer](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + + auto out = get_column_copy(&view, child, type, false, stream, mr); + return std::make_unique( + cudf::detail::slice(out->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); + }); + + auto out_mask = std::move(*(get_mask_buffer(input))); + if (input->buffers[validity_buffer_idx] != nullptr) { + out_mask = detail::copy_bitmask(static_cast(out_mask.data()), + input->offset, + input->offset + input->length, + stream, + mr); + } + + return make_structs_column( + input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_device::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + ArrowArray offsets_array = { + .length = input->offset + input->length + 1, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = offset_buffers, + }; + nanoarrow::UniqueSchema offset_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); + auto offsets_column = + this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr)); + auto child_type = arrow_to_cudf_type(&view); + auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr); + + auto const num_rows = offsets_column->size() - 1; + auto out_col = make_lists_column(num_rows, + std::move(offsets_column), + std::move(child_column), + input->null_count, + std::move(*get_mask_buffer(input)), + stream, + mr); + + return num_rows == input->length + ? std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); +} + using dispatch_tuple_t = std::tuple; struct dispatch_from_arrow_device { @@ -359,6 +667,23 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( std::move(owned)); } +std::unique_ptr get_column_copy(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type.id() != type_id::EMPTY + ? std::move(type_dispatcher( + type, dispatch_copy_from_arrow_device{stream, mr}, schema, input, type, skip_mask)) + : std::make_unique(data_type(type_id::EMPTY), + input->length, + rmm::device_buffer{}, + rmm::device_buffer{}, + input->length); +} + dispatch_tuple_t get_column(ArrowSchemaView* schema, ArrowArray const* input, data_type type, @@ -436,8 +761,77 @@ unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema, custom_view_deleter{std::move(owned)}}; } +std::unique_ptr
from_arrow_device_host(ArrowSchemaView* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector> columns; + + auto type = arrow_to_cudf_type(schema); + CUDF_EXPECTS(type == data_type(type_id::STRUCT), + "Must pass a struct to `from_arrow_device_host`", + cudf::data_type_error); + + std::transform(input->array.children, + input->array.children + input->array.n_children, + schema->schema->children, + std::back_inserter(columns), + [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + return get_column_copy(&view, child, type, false, stream, mr); + }); + + return std::make_unique
(std::move(columns)); +} + +std::unique_ptr from_arrow_device_host_column(ArrowSchemaView* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto type = arrow_to_cudf_type(schema); + return get_column_copy(schema, &input->array, type, false, stream, mr); +} + } // namespace detail +std::unique_ptr
from_arrow_device_host(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL"); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_device_host`"); + + CUDF_FUNC_RANGE(); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + return detail::from_arrow_device_host(&view, input, stream, mr); +} + +std::unique_ptr from_arrow_device_host_column(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL"); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_device_host_column`"); + + CUDF_FUNC_RANGE(); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + return detail::from_arrow_device_host_column(&view, input, stream, mr); +} + unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index bbb919aa2d1..fe3ed92de4e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -273,6 +273,7 @@ ConfigureTest( interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp + interop/from_arrow_device_host_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow diff --git a/cpp/tests/interop/from_arrow_device_host_test.cpp b/cpp/tests/interop/from_arrow_device_host_test.cpp new file mode 100644 index 00000000000..99dea2b0a25 --- /dev/null +++ b/cpp/tests/interop/from_arrow_device_host_test.cpp @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length) +{ + std::vector int64_data(length); + std::vector bool_data(length); + std::vector string_data(length); + std::vector validity(length); + std::vector bool_validity(length); + std::vector bool_data_validity; + cudf::size_type length_of_individual_list = 3; + cudf::size_type length_of_list = length_of_individual_list * length; + std::vector list_int64_data(length_of_list); + std::vector list_int64_data_validity(length_of_list); + std::vector list_offsets(length + 1); + + std::vector> columns; + + std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); + std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); + auto validity_generator = []() { return rand() % 7 != 0; }; + std::generate( + list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); + std::generate( + list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { + return (n++) * length_of_individual_list; + }); + std::generate(bool_data.begin(), bool_data.end(), validity_generator); + std::generate( + string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); + std::generate(validity.begin(), validity.end(), validity_generator); + std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); + + std::transform(bool_validity.cbegin(), + bool_validity.cend(), + std::back_inserter(bool_data_validity), + [](auto val) { return static_cast(val); }); + + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release()); + columns.emplace_back( + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release()); + auto col4 = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()); + auto dict_col = cudf::dictionary::encode(col4); + columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + bool_data.begin(), bool_data.end(), bool_validity.begin()) + .release()); + auto list_child_column = cudf::test::fixed_width_column_wrapper( + list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back(cudf::make_lists_column(length, + list_offsets_column.release(), + list_child_column.release(), + list_nulls, + std::move(*list_mask))); + auto int_column = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release(); + auto str_column = + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release(); + vector_of_columns cols; + cols.push_back(move(int_column)); + cols.push_back(move(str_column)); + auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back( + cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); + + nanoarrow::UniqueSchema schema; + ArrowSchemaInit(schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema.get(), 6)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[0], "a")); + if (columns[0]->null_count() > 0) { + schema->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[1], "b")); + if (columns[1]->null_count() > 0) { + schema->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[2], "c")); + if (columns[2]->null_count() > 0) { + schema->children[2]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[2]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[3], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[3], "d")); + if (columns[3]->null_count() > 0) { + schema->children[3]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[3]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[4], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[4]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4]->children[0], "element")); + if (columns[4]->child(1).null_count() > 0) { + schema->children[4]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4], "e")); + if (columns[4]->has_nulls()) { + schema->children[4]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->flags = 0; + } + + ArrowSchemaInit(schema->children[5]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema->children[5], 2)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[0], "integral")); + if (columns[5]->child(0).has_nulls()) { + schema->children[5]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[1], "string")); + if (columns[5]->child(1).has_nulls()) { + schema->children[5]->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5], "f")); + if (columns[5]->has_nulls()) { + schema->children[5]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->flags = 0; + } + + auto int64_array = get_nanoarrow_array(int64_data, validity); + auto string_array = get_nanoarrow_array(string_data, validity); + cudf::dictionary_column_view view(dict_col->view()); + auto keys = cudf::test::to_host(view.keys()).first; + auto indices = cudf::test::to_host(view.indices()).first; + auto dict_array = get_nanoarrow_dict_array(std::vector(keys.begin(), keys.end()), + std::vector(indices.begin(), indices.end()), + validity); + auto boolarray = get_nanoarrow_array(bool_data, bool_validity); + auto list_array = get_nanoarrow_list_array( + list_int64_data, list_offsets, list_int64_data_validity, bool_data_validity); + + nanoarrow::UniqueArray arrow; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); + arrow->length = length; + + int64_array.move(arrow->children[0]); + string_array.move(arrow->children[1]); + dict_array.move(arrow->children[2]); + boolarray.move(arrow->children[3]); + list_array.move(arrow->children[4]); + + int64_array = get_nanoarrow_array(int64_data, validity); + string_array = get_nanoarrow_array(string_data, validity); + int64_array.move(arrow->children[5]->children[0]); + string_array.move(arrow->children[5]->children[1]); + + ArrowBitmap struct_validity; + ArrowBitmapInit(&struct_validity); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&struct_validity, length)); + ArrowBitmapAppendInt8Unsafe( + &struct_validity, reinterpret_cast(bool_data_validity.data()), length); + arrow->children[5]->length = length; + ArrowArraySetValidityBitmap(arrow->children[5], &struct_validity); + arrow->children[5]->null_count = + length - ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length); + + ArrowError error; + if (ArrowArrayFinishBuilding(arrow.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, &error) != + NANOARROW_OK) { + std::cerr << ArrowErrorMessage(&error) << std::endl; + CUDF_FAIL("failed to build example arrays"); + } + + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); +} + +struct FromArrowHostDeviceTest : public cudf::test::BaseFixture {}; + +template +struct FromArrowHostDeviceTestDurationsTest : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(FromArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(FromArrowHostDeviceTest, EmptyTable) +{ + auto [tbl, schema, arr] = get_nanoarrow_host_tables(0); + + auto expected_cudf_table = tbl->view(); + ArrowDeviceArray input; + memcpy(&input.array, arr.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view()); +} + +TEST_F(FromArrowHostDeviceTest, DateTimeTable) +{ + auto data = std::vector{1, 2, 3, 4, 5, 6}; + auto col = cudf::test::fixed_width_column_wrapper( + data.begin(), data.end()); + cudf::table_view expected_table_view({col}); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = 6; + input_array->null_count = 0; + + auto arr = get_nanoarrow_array(data); + arr.move(input_array->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TYPED_TEST(FromArrowHostDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view expected_table_view({col}); + const ArrowTimeUnit time_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table_view.num_rows(); + input_array->null_count = 0; + + auto arr = get_nanoarrow_array(data); + arr.move(input_array->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + cudf::table_view expected_table_view({col}); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + input_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element")); + input_schema->children[0]->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element")); + input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); + std::vector offset{0, 0, 2}; + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1)); + + nanoarrow::UniqueArray input_array; + EXPECT_EQ(NANOARROW_OK, ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table_view.num_rows(); + input_array->null_count = 0; + + ArrowArraySetValidityBitmap(input_array->children[0], &mask); + input_array->children[0]->length = expected_table_view.num_rows(); + input_array->children[0]->null_count = 1; + auto offset_buf = ArrowArrayBuffer(input_array->children[0], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + + list_arr.move(input_array->children[0]->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + cudf::table_view expected_table_view({struct_col->view()}); + + // Create name metadata + auto sub_metadata = cudf::column_metadata{"struct"}; + sub_metadata.children_meta = {{"string2"}, {"integral2"}}; + auto metadata = cudf::column_metadata{"a"}; + metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + input_schema->children[0]->flags = 0; + + auto child = input_schema->children[0]; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string")); + child->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral")); + child->children[1]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool")); + child->children[2]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list")); + child->children[3]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element")); + child->children[3]->children[0]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element")); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct")); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2")); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); + + // create nanoarrow table + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; + std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; + auto str_array = get_nanoarrow_array(str); + auto int_array = get_nanoarrow_array({48, 27, 25}); + auto str2_array = get_nanoarrow_array(str2, {0, 1, 0}); + auto int2_array = get_nanoarrow_array({12, 24, 47}, {1, 0, 1}); + auto bool_array = get_nanoarrow_array({true, true, false}); + auto list_arr = + get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); + std::vector offset{0, 3, 4, 6}; + + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + + input_array->length = expected_table_view.num_rows(); + + auto array_a = input_array->children[0]; + auto view_a = expected_table_view.column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + str_array.move(array_a->children[0]); + int_array.move(array_a->children[1]); + bool_array.move(array_a->children[2]); + + array_a->children[3]->length = expected_table_view.num_rows(); + array_a->children[3]->null_count = 0; + auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + + list_arr.move(array_a->children[3]->children[0]); + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + + auto array_struct = array_a->children[4]; + auto view_struct = view_a.child(4); + ArrowArraySetValidityBitmap(array_struct, &mask); + array_struct->null_count = view_struct.null_count(); + array_struct->length = view_struct.size(); + + str2_array.move(array_struct->children[0]); + int2_array.move(array_struct->children[1]); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) +{ + auto array1 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto array2 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto array3 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + + auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); + auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind2_col = + cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind3_col = + cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + + vector_of_columns columns; + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col)); + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind2_col)); + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind3_col)); + + cudf::table expected_table(std::move(columns)); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table.num_rows(); + input_array->null_count = 0; + + array1.move(input_array->children[0]); + array2.move(input_array->children[1]); + array3.move(input_array->children[2]); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) +{ + auto op = [&](ArrowArray* array) { + array->offset = start; + array->length = end - start; + if (array->null_count != 0) { + array->null_count = + array->length - ArrowBitCountSet(ArrowArrayValidityBitmap(array)->buffer.data, start, end - start); + } + }; + + if (arr->n_children == 0) { + op(arr); + return; + } + + arr->length = end - start; + for (int64_t i = 0; i < arr->n_children; ++i) { + op(arr->children[i]); + } +} + +struct FromArrowHostDeviceTestSlice + : public FromArrowHostDeviceTest, + public ::testing::WithParamInterface> {}; + +TEST_P(FromArrowHostDeviceTestSlice, SliceTest) +{ + auto [table, schema, array] = get_nanoarrow_host_tables(10000); + auto cudf_table_view = table->view(); + auto const [start, end] = GetParam(); + + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto expected_cudf_table = cudf::table{sliced_cudf_table}; + slice_host_nanoarrow(array.get(), start, end); + + ArrowDeviceArray input; + memcpy(&input.array, array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_device_host(schema.get(), &input); + if (got_cudf_table->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) { + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), + got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct); + } else { + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_device_host_column(schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), + got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); + } +} + + +INSTANTIATE_TEST_CASE_P(FromArrowHostDeviceTest, + FromArrowHostDeviceTestSlice, + ::testing::Values(std::make_tuple(0, 10000), + std::make_tuple(2912, 2915), + std::make_tuple(100, 3000), + std::make_tuple(0, 0), + std::make_tuple(0, 3000), + std::make_tuple(10000, 10000))); diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index fb5d1060f6f..588b5a39dfe 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -20,11 +20,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -135,7 +137,187 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview) populate_from_col(arr->dictionary, dview.keys()); } +using vector_of_columns = std::vector>; + std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_tables(cudf::size_type length = 10000); void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view); + +std::unique_ptr get_cudf_table(); + +template +struct nanoarrow_storage_type {}; + +#define DEFINE_NANOARROW_STORAGE(T, NanoType) \ + template <> \ + struct nanoarrow_storage_type { \ + static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \ + } + +DEFINE_NANOARROW_STORAGE(bool, BOOL); +DEFINE_NANOARROW_STORAGE(int64_t, INT64); +DEFINE_NANOARROW_STORAGE(uint16_t, UINT16); +DEFINE_NANOARROW_STORAGE(uint64_t, UINT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32); +DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); +DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); +DEFINE_NANOARROW_STORAGE(int32_t, INT32); + +#undef DEFINE_NANOARROW_STORAGE + +template +std::enable_if_t() and !std::is_same_v, nanoarrow::UniqueArray> +get_nanoarrow_array(std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), nanoarrow_storage_type::type)); + + if (!mask.empty()) { + ArrowBitmap bitmap; + ArrowBitmapInit(&bitmap); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, mask.size())); + ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast(mask.data()), mask.size()); + + ArrowArraySetValidityBitmap(tmp.get(), &bitmap); + tmp->null_count = data.size() - ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + } + + ArrowBuffer buf; + ArrowBufferInit(&buf); + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(&buf, reinterpret_cast(data.data()), sizeof(T) * data.size())); + ArrowArraySetBuffer(tmp.get(), 1, &buf); + + tmp->length = data.size(); + + return tmp; +} + +template +std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_BOOL)); + + auto to_arrow_bitmap = [](std::vector const& b) -> ArrowBitmap { + ArrowBitmap out; + ArrowBitmapInit(&out); + NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); + out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); + out.size_bits = b.size(); + + for (size_t i = 0; i < b.size(); ++i) { + ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); + } + + return out; + }; + + if (!mask.empty()) { + auto validity_bitmap = to_arrow_bitmap(mask); + ArrowArraySetValidityBitmap(tmp.get(), &validity_bitmap); + tmp->null_count = mask.size() - ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + } + + auto raw_buffer = to_arrow_bitmap(data); + ArrowArraySetBuffer(tmp.get(), 1, &raw_buffer.buffer); + tmp->length = data.size(); + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_array(std::initializer_list elements, + std::initializer_list validity = {}) +{ + std::vector mask(validity); + std::vector data(elements); + + return get_nanoarrow_array(data, mask); +} + +template +std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get())); + NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size())); + + for (size_t i = 0; i < data.size(); ++i) { + if (!mask.empty() && mask[i] == 0) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendNull(tmp.get(), 1)); + } else { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(tmp.get(), ArrowCharView(data[i].c_str()))); + } + } + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_dict_array(std::vector const& keys, + std::vector const& ind, + std::vector const& validity = {}) +{ + auto indices_array = get_nanoarrow_array(ind, validity); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateDictionary(indices_array.get())); + + auto keys_array = get_nanoarrow_array(keys); + keys_array.move(indices_array->dictionary); + + return indices_array; +} + +template +nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector const& data, + std::vector const& offsets, + std::vector const& data_validity = {}, + std::vector const& list_validity = {}) +{ + auto data_array = get_nanoarrow_array(data, data_validity); + + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + data_array.move(tmp->children[0]); + + tmp->length = offsets.size() - 1; + if (!list_validity.empty()) { + ArrowBitmap bitmap; + ArrowBitmapInit(&bitmap); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, list_validity.size())); + ArrowBitmapAppendInt8Unsafe( + &bitmap, reinterpret_cast(list_validity.data()), list_validity.size()); + + ArrowArraySetValidityBitmap(tmp.get(), &bitmap); + tmp->null_count = tmp->length - ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, list_validity.size()); + } + + ArrowBuffer buf; + ArrowBufferInit(&buf); + NANOARROW_THROW_NOT_OK(ArrowBufferAppend( + &buf, reinterpret_cast(offsets.data()), sizeof(int32_t) * offsets.size())); + ArrowArraySetBuffer(tmp.get(), 1, &buf); + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, + std::initializer_list offsets, + std::initializer_list data_validity = {}, + std::initializer_list list_validity = {}) +{ + std::vector data_vector(data); + std::vector offset(offsets); + std::vector data_mask(data_validity); + std::vector list_mask(list_validity); + return get_nanoarrow_list_array(data_vector, offset, data_mask, list_mask); +} \ No newline at end of file diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 626aeb53cdd..a10517128d2 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -38,8 +38,6 @@ #include -using vector_of_columns = std::vector>; - std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_tables(cudf::size_type length) { From 5e06b19cce141e1012a6fdf79979d96e106079c9 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 6 May 2024 11:38:57 -0400 Subject: [PATCH 02/22] rename functions and lint --- cpp/include/cudf/interop.hpp | 25 ++++++----- cpp/src/interop/from_arrow_device.cu | 42 +++++++++---------- .../interop/from_arrow_device_host_test.cpp | 32 +++++++------- cpp/tests/interop/nanoarrow_utils.hpp | 18 +++++--- 4 files changed, 61 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 577fe10f773..df58390304f 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -352,46 +352,45 @@ std::unique_ptr from_arrow( * @brief Create `cudf::table` from given ArrowDeviceArray input * * @throws cudf::logic_error if either schema or input are NULL - * + * * @throws cudf::logic_error if the device_type is not `ARROW_DEVICE_CPU` - * + * * @throws cudf::data_type_error if the input array is not a struct array, - * non-struct arrays should be passed to `from_arrow_device_host_column` instead. - * + * non-struct arrays should be passed to `from_arrow_host_column` instead. + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to perform cuda allocation * @return cudf table generated from the given Arrow data */ -std::unique_ptr
from_arrow_device_host( +std::unique_ptr
from_arrow_host( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from given ArrowDeviceArray input - * + * * @throws cudf::logic_error if either schema or input are NULL - * + * * @throws cudf::logic_error if the device_type is not `ARROW_DEVICE_CPU` - * + * * @throws cudf::data_type_error if input arrow data type is not supported in cudf. - * + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to perform cuda allocation * @return cudf table generated from the given Arrow data */ -std::unique_ptr from_arrow_device_host_column( +std::unique_ptr from_arrow_host_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray * diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 70d41b4b688..77926896278 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -761,16 +761,16 @@ unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema, custom_view_deleter{std::move(owned)}}; } -std::unique_ptr
from_arrow_device_host(ArrowSchemaView* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
from_arrow_host(ArrowSchemaView* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector> columns; auto type = arrow_to_cudf_type(schema); CUDF_EXPECTS(type == data_type(type_id::STRUCT), - "Must pass a struct to `from_arrow_device_host`", + "Must pass a struct to `from_arrow_host`", cudf::data_type_error); std::transform(input->array.children, @@ -787,10 +787,10 @@ std::unique_ptr
from_arrow_device_host(ArrowSchemaView* schema, return std::make_unique
(std::move(columns)); } -std::unique_ptr from_arrow_device_host_column(ArrowSchemaView* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr from_arrow_host_column(ArrowSchemaView* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto type = arrow_to_cudf_type(schema); return get_column_copy(schema, &input->array, type, false, stream, mr); @@ -798,38 +798,38 @@ std::unique_ptr from_arrow_device_host_column(ArrowSchemaView* schema, } // namespace detail -std::unique_ptr
from_arrow_device_host(ArrowSchema const* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
from_arrow_host(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL"); CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_device_host`"); + "ArrowDeviceArray must have CPU device type for `from_arrow_host`"); CUDF_FUNC_RANGE(); ArrowSchemaView view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device_host(&view, input, stream, mr); + return detail::from_arrow_host(&view, input, stream, mr); } -std::unique_ptr from_arrow_device_host_column(ArrowSchema const* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL"); CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_device_host_column`"); + "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`"); CUDF_FUNC_RANGE(); ArrowSchemaView view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device_host_column(&view, input, stream, mr); + return detail::from_arrow_host_column(&view, input, stream, mr); } unique_table_view_t from_arrow_device(ArrowSchema const* schema, diff --git a/cpp/tests/interop/from_arrow_device_host_test.cpp b/cpp/tests/interop/from_arrow_device_host_test.cpp index 99dea2b0a25..6c35e649963 100644 --- a/cpp/tests/interop/from_arrow_device_host_test.cpp +++ b/cpp/tests/interop/from_arrow_device_host_test.cpp @@ -263,7 +263,7 @@ TEST_F(FromArrowHostDeviceTest, EmptyTable) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view()); } @@ -297,10 +297,10 @@ TEST_F(FromArrowHostDeviceTest, DateTimeTable) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{ @@ -351,10 +351,10 @@ TYPED_TEST(FromArrowHostDeviceTestDurationsTest, DurationTable) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{ @@ -421,10 +421,10 @@ TEST_F(FromArrowHostDeviceTest, NestedList) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{ @@ -579,10 +579,10 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{ @@ -652,10 +652,10 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(input_schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(input_schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{ @@ -670,7 +670,8 @@ void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) array->length = end - start; if (array->null_count != 0) { array->null_count = - array->length - ArrowBitCountSet(ArrowArrayValidityBitmap(array)->buffer.data, start, end - start); + array->length - + ArrowBitCountSet(ArrowArrayValidityBitmap(array)->buffer.data, start, end - start); } }; @@ -704,11 +705,11 @@ TEST_P(FromArrowHostDeviceTestSlice, SliceTest) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; - auto got_cudf_table = cudf::from_arrow_device_host(schema.get(), &input); + auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); if (got_cudf_table->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) { CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), @@ -717,7 +718,7 @@ TEST_P(FromArrowHostDeviceTestSlice, SliceTest) } else { CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view()); - auto got_cudf_col = cudf::from_arrow_device_host_column(schema.get(), &input); + auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), @@ -726,7 +727,6 @@ TEST_P(FromArrowHostDeviceTestSlice, SliceTest) } } - INSTANTIATE_TEST_CASE_P(FromArrowHostDeviceTest, FromArrowHostDeviceTestSlice, ::testing::Values(std::make_tuple(0, 10000), diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 588b5a39dfe..f1677627800 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -183,7 +183,9 @@ get_nanoarrow_array(std::vector const& data, std::vector const& mask ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast(mask.data()), mask.size()); ArrowArraySetValidityBitmap(tmp.get(), &bitmap); - tmp->null_count = data.size() - ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + tmp->null_count = + data.size() - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); } ArrowBuffer buf; @@ -209,8 +211,8 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); - out.size_bits = b.size(); - + out.size_bits = b.size(); + for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); } @@ -219,9 +221,11 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ }; if (!mask.empty()) { - auto validity_bitmap = to_arrow_bitmap(mask); + auto validity_bitmap = to_arrow_bitmap(mask); ArrowArraySetValidityBitmap(tmp.get(), &validity_bitmap); - tmp->null_count = mask.size() - ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + tmp->null_count = + mask.size() - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); } auto raw_buffer = to_arrow_bitmap(data); @@ -297,7 +301,9 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector const& data, &bitmap, reinterpret_cast(list_validity.data()), list_validity.size()); ArrowArraySetValidityBitmap(tmp.get(), &bitmap); - tmp->null_count = tmp->length - ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, list_validity.size()); + tmp->null_count = + tmp->length - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, list_validity.size()); } ArrowBuffer buf; From 79414d199de750ddc588437152999c42bbf783c5 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 6 May 2024 13:21:50 -0400 Subject: [PATCH 03/22] from feedback --- cpp/src/interop/from_arrow_device.cu | 31 ++++++++++++++------------- cpp/tests/interop/nanoarrow_utils.hpp | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 77926896278..9ad473913bc 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -250,21 +250,22 @@ std::unique_ptr dispatch_copy_from_arrow_device::operator()operator()(&view, &char_array, data_type(type_id::INT8), true); auto const num_rows = offsets_column->size() - 1; - auto out_col = make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - input->null_count, - std::move(*get_mask_buffer(input))); - - return input->offset == 0 - ? std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); + auto bitmask = get_mask_buffer(input); + auto out_col = cudf::column_view(data_type{type_id::STRING}, + num_rows, + chars_column->view().head(), + static_cast(bitmask->data()), + input->null_count, + 0, + {offsets_column->view()}); + + return std::make_unique( + cudf::detail::slice(out_col, + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); } template <> diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index f1677627800..7dd8ea7a20b 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -326,4 +326,4 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, std::vector data_mask(data_validity); std::vector list_mask(list_validity); return get_nanoarrow_list_array(data_vector, offset, data_mask, list_mask); -} \ No newline at end of file +} From a05e78893fcc39ed3e30285b75bda0464ebfe51d Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 8 May 2024 11:42:21 -0400 Subject: [PATCH 04/22] back to original impl --- cpp/src/interop/from_arrow_device.cu | 31 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 9ad473913bc..b309c7b9830 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -250,22 +250,21 @@ std::unique_ptr dispatch_copy_from_arrow_device::operator()operator()(&view, &char_array, data_type(type_id::INT8), true); auto const num_rows = offsets_column->size() - 1; - auto bitmask = get_mask_buffer(input); - auto out_col = cudf::column_view(data_type{type_id::STRING}, - num_rows, - chars_column->view().head(), - static_cast(bitmask->data()), - input->null_count, - 0, - {offsets_column->view()}); - - return std::make_unique( - cudf::detail::slice(out_col, - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); + auto out_col = make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + input->null_count, + std::move(*get_mask_buffer(input))); + + return input->offset == 0 + ? std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col, + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); } template <> From 7bfc8d494c584e5690c5976ebb36f91344eeb915 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 14 May 2024 13:37:47 -0400 Subject: [PATCH 05/22] updates from feedback and including extra comments --- cpp/CMakeLists.txt | 1 + cpp/src/interop/arrow_utilities.hpp | 6 + cpp/src/interop/from_arrow_device.cu | 391 ---------------- cpp/src/interop/from_arrow_host.cu | 443 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 2 +- ...host_test.cpp => from_arrow_host_test.cpp} | 32 +- 6 files changed, 482 insertions(+), 393 deletions(-) create mode 100644 cpp/src/interop/from_arrow_host.cu rename cpp/tests/interop/{from_arrow_device_host_test.cpp => from_arrow_host_test.cpp} (94%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f11f3fc3c9a..b4ee6ef923f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -360,6 +360,7 @@ add_library( src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu + src/interop/from_arrow_host.cu src/interop/to_arrow_schema.cpp src/interop/to_arrow_utilities.cpp src/interop/detail/arrow_allocator.cpp diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 9bbdaa2c363..152b8c3e939 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -16,6 +16,10 @@ #pragma once +#include + +#include + namespace cudf { namespace detail { @@ -26,5 +30,7 @@ namespace detail { static constexpr int validity_buffer_idx = 0; static constexpr int fixed_width_data_buffer_idx = 1; +data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index b309c7b9830..4c82a5d6a8c 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -42,8 +42,6 @@ #include #include -#include - namespace cudf { namespace detail { @@ -93,309 +91,6 @@ data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) namespace { -struct dispatch_copy_from_arrow_device { - rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; - - std::unique_ptr get_mask_buffer(ArrowArray const* array) - { - auto* bitmap = array->buffers[validity_buffer_idx]; - if (bitmap == nullptr) { return std::make_unique(0, stream, mr); } - - auto const bitmask_size = array->length + array->offset; - auto const allocation_size = - bitmask_allocation_size_bytes(static_cast(bitmask_size)); - auto mask = std::make_unique(allocation_size, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), - reinterpret_cast(bitmap), - allocation_size, - cudaMemcpyDefault, - stream.value())); - return mask; - } - - template () && - !std::is_same_v)> - std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) - { - CUDF_FAIL("Unsupported type in copy_from_arrow_device."); - } - - template () || std::is_same_v)> - std::unique_ptr operator()(ArrowSchemaView* schema, - ArrowArray const* input, - data_type type, - bool skip_mask) - { - using DeviceType = std::conditional_t, __int128_t, T>; - - size_type const num_rows = input->length; - size_type const offset = input->offset; - size_type const null_count = input->null_count; - auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; - - auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; - auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_column_view = col->mutable_view(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(mutable_column_view.data(), - reinterpret_cast(data_buffer) + offset * sizeof(DeviceType), - sizeof(DeviceType) * num_rows, - cudaMemcpyDefault, - stream.value())); - - if (has_nulls) { - auto tmp_mask = get_mask_buffer(input); - - // if array is sliced, we have to copy the whole mask and then take copy - auto out_mask = - (offset == 0) - ? std::move(*tmp_mask) - : cudf::detail::copy_bitmask( - static_cast(tmp_mask->data()), offset, offset + num_rows, stream, mr); - - col->set_null_mask(std::move(out_mask), null_count); - } - - return col; - } -}; - -// forward declaration is needed because `type_dispatch` instantiates the -// dispatch_from_arrow_device struct causing a recursive situation for struct, -// dictionary and list_view types. -std::unique_ptr get_column_copy(ArrowSchemaView* schema, - ArrowArray const* input, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -template <> -std::unique_ptr dispatch_copy_from_arrow_device::operator()(ArrowSchemaView* schema, - ArrowArray const* input, - data_type type, - bool skip_mask) -{ - auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; - const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); - // mask-to-bools expects the mask to be bitmask_type aligned/padded - auto data = rmm::device_buffer(buffer_length, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), - reinterpret_cast(data_buffer), - buffer_length, - cudaMemcpyDefault, - stream.value())); - auto out_col = mask_to_bools(static_cast(data.data()), - input->offset, - input->offset + input->length, - stream, - mr); - - auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; - if (has_nulls) { - auto out_mask = detail::copy_bitmask(static_cast(get_mask_buffer(input)->data()), - input->offset, - input->offset + input->length, - stream, - mr); - - out_col->set_null_mask(std::move(out_mask), input->null_count); - } - - return out_col; -} - -template <> -std::unique_ptr dispatch_copy_from_arrow_device::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) -{ - if (input->length == 0) { return make_empty_column(type_id::STRING); } - - const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; - ArrowArray offsets_array = { - .length = input->offset + input->length + 1, - .null_count = 0, - .offset = 0, - .n_buffers = 2, - .n_children = 0, - .buffers = offset_buffers, - }; - - size_type const char_data_length = - reinterpret_cast(offset_buffers[1])[input->length + input->offset]; - const void* char_buffers[2] = {nullptr, input->buffers[2]}; - ArrowArray char_array = { - .length = char_data_length, - .null_count = 0, - .offset = 0, - .n_buffers = 2, - .n_children = 0, - .buffers = char_buffers, - }; - - nanoarrow::UniqueSchema offset_schema; - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); - - nanoarrow::UniqueSchema char_data_schema; - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8)); - - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); - auto offsets_column = - this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); - auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), - input->null_count, - std::move(*get_mask_buffer(input))); - - return input->offset == 0 - ? std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col, - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); -} - -template <> -std::unique_ptr dispatch_copy_from_arrow_device::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) -{ - ArrowSchemaView keys_schema_view; - NANOARROW_THROW_NOT_OK( - ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr)); - - auto const keys_type = arrow_to_cudf_type(&keys_schema_view); - auto keys_column = - get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); - - auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. - switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); - default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); - } - }(); - - auto indices_column = get_column_copy(schema, input, dict_indices_type, false, stream, mr); - // child columns shouldn't have masks and we need the mask in the main column - auto column_contents = indices_column->release(); - indices_column = std::make_unique(dict_indices_type, - static_cast(input->length), - std::move(*(column_contents.data)), - rmm::device_buffer{}, - 0); - - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - std::move(*(column_contents.null_mask)), - input->null_count); -} - -template <> -std::unique_ptr dispatch_copy_from_arrow_device::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) -{ - char buffer[1024]; - - std::vector> child_columns; - std::transform(input->children, - input->children + input->n_children, - schema->schema->children, - std::back_inserter(child_columns), - [this, input, &buffer](ArrowArray const* child, ArrowSchema const* child_schema) { - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); - auto type = arrow_to_cudf_type(&view); - - auto out = get_column_copy(&view, child, type, false, stream, mr); - return std::make_unique( - cudf::detail::slice(out->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); - }); - - auto out_mask = std::move(*(get_mask_buffer(input))); - if (input->buffers[validity_buffer_idx] != nullptr) { - out_mask = detail::copy_bitmask(static_cast(out_mask.data()), - input->offset, - input->offset + input->length, - stream, - mr); - } - - return make_structs_column( - input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr); -} - -template <> -std::unique_ptr dispatch_copy_from_arrow_device::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) -{ - const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; - ArrowArray offsets_array = { - .length = input->offset + input->length + 1, - .null_count = 0, - .offset = 0, - .n_buffers = 2, - .n_children = 0, - .buffers = offset_buffers, - }; - nanoarrow::UniqueSchema offset_schema; - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); - - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); - auto offsets_column = - this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); - - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr)); - auto child_type = arrow_to_cudf_type(&view); - auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - input->null_count, - std::move(*get_mask_buffer(input)), - stream, - mr); - - return num_rows == input->length - ? std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); -} - using dispatch_tuple_t = std::tuple; struct dispatch_from_arrow_device { @@ -667,23 +362,6 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( std::move(owned)); } -std::unique_ptr get_column_copy(ArrowSchemaView* schema, - ArrowArray const* input, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return type.id() != type_id::EMPTY - ? std::move(type_dispatcher( - type, dispatch_copy_from_arrow_device{stream, mr}, schema, input, type, skip_mask)) - : std::make_unique(data_type(type_id::EMPTY), - input->length, - rmm::device_buffer{}, - rmm::device_buffer{}, - input->length); -} - dispatch_tuple_t get_column(ArrowSchemaView* schema, ArrowArray const* input, data_type type, @@ -761,77 +439,8 @@ unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema, custom_view_deleter{std::move(owned)}}; } -std::unique_ptr
from_arrow_host(ArrowSchemaView* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - std::vector> columns; - - auto type = arrow_to_cudf_type(schema); - CUDF_EXPECTS(type == data_type(type_id::STRUCT), - "Must pass a struct to `from_arrow_host`", - cudf::data_type_error); - - std::transform(input->array.children, - input->array.children + input->array.n_children, - schema->schema->children, - std::back_inserter(columns), - [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); - auto type = arrow_to_cudf_type(&view); - return get_column_copy(&view, child, type, false, stream, mr); - }); - - return std::make_unique
(std::move(columns)); -} - -std::unique_ptr from_arrow_host_column(ArrowSchemaView* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto type = arrow_to_cudf_type(schema); - return get_column_copy(schema, &input->array, type, false, stream, mr); -} - } // namespace detail -std::unique_ptr
from_arrow_host(ArrowSchema const* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_host`"); - - CUDF_FUNC_RANGE(); - - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_host(&view, input, stream, mr); -} - -std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, - ArrowDeviceArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`"); - - CUDF_FUNC_RANGE(); - - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_host_column(&view, input, stream, mr); -} - unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu new file mode 100644 index 00000000000..db420265e59 --- /dev/null +++ b/cpp/src/interop/from_arrow_host.cu @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +struct dispatch_copy_from_arrow_host { + rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; + + std::unique_ptr get_mask_buffer(ArrowArray const* array) + { + auto* bitmap = array->buffers[validity_buffer_idx]; + if (bitmap == nullptr) { return std::make_unique(0, stream, mr); } + + auto const bitmask_size = array->length + array->offset; + auto const allocation_size = + bitmask_allocation_size_bytes(static_cast(bitmask_size)); + auto mask = std::make_unique(allocation_size, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), + reinterpret_cast(bitmap), + allocation_size, + cudaMemcpyDefault, + stream.value())); + return mask; + } + + template () && + !std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) + { + CUDF_FAIL("Unsupported type in copy_from_arrow_device."); + } + + template () || std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) + { + using DeviceType = std::conditional_t, __int128_t, T>; + + size_type const num_rows = input->length; + size_type const offset = input->offset; + size_type const null_count = input->null_count; + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_column_view = col->mutable_view(); + CUDF_CUDA_TRY( + cudaMemcpyAsync(mutable_column_view.data(), + reinterpret_cast(data_buffer) + offset * sizeof(DeviceType), + sizeof(DeviceType) * num_rows, + cudaMemcpyDefault, + stream.value())); + + if (has_nulls) { + auto tmp_mask = get_mask_buffer(input); + + // if array is sliced, we have to copy the whole mask and then take copy + auto out_mask = + (offset == 0) + ? std::move(*tmp_mask) + : cudf::detail::copy_bitmask( + static_cast(tmp_mask->data()), offset, offset + num_rows, stream, mr); + + col->set_null_mask(std::move(out_mask), null_count); + } + + return col; + } +}; + +// forward declaration is needed because `type_dispatch` instantiates the +// dispatch_from_arrow_device struct causing a recursive situation for struct, +// dictionary and list_view types. +std::unique_ptr get_column_copy(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) +{ + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); + // mask-to-bools expects the mask to be bitmask_type aligned/padded + auto data = rmm::device_buffer(buffer_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), + reinterpret_cast(data_buffer), + buffer_length, + cudaMemcpyDefault, + stream.value())); + auto out_col = mask_to_bools(static_cast(data.data()), + input->offset, + input->offset + input->length, + stream, + mr); + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + if (has_nulls) { + auto out_mask = detail::copy_bitmask(static_cast(get_mask_buffer(input)->data()), + input->offset, + input->offset + input->length, + stream, + mr); + + out_col->set_null_mask(std::move(out_mask), input->null_count); + } + + return out_col; +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + if (input->length == 0) { return make_empty_column(type_id::STRING); } + + const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + ArrowArray offsets_array = { + .length = input->offset + input->length + 1, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = offset_buffers, + }; + + size_type const char_data_length = + reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + const void* char_buffers[2] = {nullptr, input->buffers[2]}; + ArrowArray char_array = { + .length = char_data_length, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = char_buffers, + }; + + nanoarrow::UniqueSchema offset_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); + + nanoarrow::UniqueSchema char_data_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8)); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); + auto offsets_column = + this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); + auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + + auto const num_rows = offsets_column->size() - 1; + auto out_col = make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + input->null_count, + std::move(*get_mask_buffer(input))); + + return input->offset == 0 + ? std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + ArrowSchemaView keys_schema_view; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr)); + + auto const keys_type = arrow_to_cudf_type(&keys_schema_view); + auto keys_column = + get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); + + auto const dict_indices_type = [&schema]() -> data_type { + // cudf dictionary requires an unsigned type for the indices, + // since it is invalid for an arrow dictionary to contain negative + // indices, we can safely use the unsigned equivalent without having + // to modify the buffers. + switch (schema->storage_type) { + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); + } + }(); + + auto indices_column = get_column_copy(schema, input, dict_indices_type, false, stream, mr); + // child columns shouldn't have masks and we need the mask in the main column + auto column_contents = indices_column->release(); + indices_column = std::make_unique(dict_indices_type, + static_cast(input->length), + std::move(*(column_contents.data)), + rmm::device_buffer{}, + 0); + + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + std::move(*(column_contents.null_mask)), + input->null_count); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + char buffer[1024]; + + std::vector> child_columns; + std::transform(input->children, + input->children + input->n_children, + schema->schema->children, + std::back_inserter(child_columns), + [this, input, &buffer](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + + auto out = get_column_copy(&view, child, type, false, stream, mr); + return std::make_unique( + cudf::detail::slice(out->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); + }); + + auto out_mask = std::move(*(get_mask_buffer(input))); + if (input->buffers[validity_buffer_idx] != nullptr) { + out_mask = detail::copy_bitmask(static_cast(out_mask.data()), + input->offset, + input->offset + input->length, + stream, + mr); + } + + return make_structs_column( + input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + ArrowArray offsets_array = { + .length = input->offset + input->length + 1, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = offset_buffers, + }; + nanoarrow::UniqueSchema offset_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); + auto offsets_column = + this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr)); + auto child_type = arrow_to_cudf_type(&view); + auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr); + + auto const num_rows = offsets_column->size() - 1; + auto out_col = make_lists_column(num_rows, + std::move(offsets_column), + std::move(child_column), + input->null_count, + std::move(*get_mask_buffer(input)), + stream, + mr); + + return num_rows == input->length + ? std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); +} + +std::unique_ptr get_column_copy(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type.id() != type_id::EMPTY + ? std::move(type_dispatcher( + type, dispatch_copy_from_arrow_host{stream, mr}, schema, input, type, skip_mask)) + : std::make_unique(data_type(type_id::EMPTY), + input->length, + rmm::device_buffer{}, + rmm::device_buffer{}, + input->length); +} + +} // namespace + +std::unique_ptr
from_arrow_host(ArrowSchemaView* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector> columns; + + auto type = arrow_to_cudf_type(schema); + CUDF_EXPECTS(type == data_type(type_id::STRUCT), + "Must pass a struct to `from_arrow_host`", + cudf::data_type_error); + + std::transform(input->array.children, + input->array.children + input->array.n_children, + schema->schema->children, + std::back_inserter(columns), + [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + return get_column_copy(&view, child, type, false, stream, mr); + }); + + return std::make_unique
(std::move(columns)); +} + +std::unique_ptr from_arrow_host_column(ArrowSchemaView* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto type = arrow_to_cudf_type(schema); + return get_column_copy(schema, &input->array, type, false, stream, mr); +} + +} // namespace detail + +std::unique_ptr
from_arrow_host(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL"); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_host`"); + + CUDF_FUNC_RANGE(); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + return detail::from_arrow_host(&view, input, stream, mr); +} + +std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL"); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`"); + + CUDF_FUNC_RANGE(); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + return detail::from_arrow_host_column(&view, input, stream, mr); +} + +} // namespace cudf \ No newline at end of file diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 10e9b4de39f..5e8239a2467 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -273,7 +273,7 @@ ConfigureTest( interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp - interop/from_arrow_device_host_test.cpp + interop/from_arrow_host_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow diff --git a/cpp/tests/interop/from_arrow_device_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp similarity index 94% rename from cpp/tests/interop/from_arrow_device_host_test.cpp rename to cpp/tests/interop/from_arrow_host_test.cpp index 6c35e649963..0c75e1d8b5e 100644 --- a/cpp/tests/interop/from_arrow_device_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -40,6 +40,7 @@ #include +// create a cudf::table and equivalent arrow table with host memory std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_host_tables(cudf::size_type length) { @@ -274,6 +275,7 @@ TEST_F(FromArrowHostDeviceTest, DateTimeTable) data.begin(), data.end()); cudf::table_view expected_table_view({col}); + // construct equivalent arrow schema with nanoarrow nanoarrow::UniqueSchema input_schema; ArrowSchemaInit(input_schema.get()); NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); @@ -282,6 +284,7 @@ TEST_F(FromArrowHostDeviceTest, DateTimeTable) input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + // equivalent arrow record batch nanoarrow::UniqueArray input_array; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); input_array->length = 6; @@ -297,9 +300,13 @@ TEST_F(FromArrowHostDeviceTest, DateTimeTable) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; + // test that we get the same cudf table as we expect by converting the + // host arrow memory to a cudf table auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + // test that we get a cudf table with a single struct column that is equivalent + // if we use from_arrow_host_column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); @@ -351,9 +358,12 @@ TYPED_TEST(FromArrowHostDeviceTestDurationsTest, DurationTable) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; + // converting arrow host memory to cudf table gives us the expected table auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + // converting to a cudf table with a single struct column gives us the expected + // result column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); @@ -389,9 +399,11 @@ TEST_F(FromArrowHostDeviceTest, NestedList) ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element")); input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + // create the base arrow list array auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); std::vector offset{0, 0, 2}; + // populate the bitmask we're going to use for the top level list ArrowBitmap mask; ArrowBitmapInit(&mask); NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); @@ -412,6 +424,8 @@ TEST_F(FromArrowHostDeviceTest, NestedList) ArrowBufferAppend( offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + // move our base list to be the child of the one we just created + // so that we now have an equivalent value to what we created for cudf list_arr.move(input_array->children[0]->children[0]); NANOARROW_THROW_NOT_OK( ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); @@ -421,9 +435,11 @@ TEST_F(FromArrowHostDeviceTest, NestedList) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; + // converting from arrow host memory to cudf gives us the expected table auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + // converting to a single column cudf table gives us the expected struct column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); @@ -474,6 +490,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto metadata = cudf::column_metadata{"a"}; metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + // create the equivalent arrow schema using nanoarrow nanoarrow::UniqueSchema input_schema; ArrowSchemaInit(input_schema.get()); NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); @@ -521,6 +538,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); // create nanoarrow table + // first our underlying arrays std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; auto str_array = get_nanoarrow_array(str); @@ -532,6 +550,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); std::vector offset{0, 3, 4, 6}; + // create the struct array nanoarrow::UniqueArray input_array; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); @@ -541,7 +560,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto view_a = expected_table_view.column(0); array_a->length = view_a.size(); array_a->null_count = view_a.null_count(); - + // populate the children of our struct by moving them from the original arrays str_array.move(array_a->children[0]); int_array.move(array_a->children[1]); bool_array.move(array_a->children[2]); @@ -556,6 +575,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) list_arr.move(array_a->children[3]->children[0]); + // set our struct bitmap validity mask ArrowBitmap mask; ArrowBitmapInit(&mask); NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); @@ -579,9 +599,11 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; + // test we get the expected cudf::table from the arrow host memory data auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + // test we get the expected cudf struct column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); @@ -592,6 +614,8 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) { + // test dictionary arrays with different index types + // cudf asserts that the index type must be unsigned auto array1 = get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array2 = @@ -599,6 +623,7 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) auto array3 = get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + // create equivalent cudf dictionary columns auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto ind2_col = @@ -652,9 +677,11 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) input.device_id = -1; input.device_type = ARROW_DEVICE_CPU; + // test we get the expected cudf table when we convert from Arrow host memory auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view()); + // test we get the expected cudf::column as a struct column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); auto got_cudf_col_view = got_cudf_col->view(); @@ -666,6 +693,7 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) { auto op = [&](ArrowArray* array) { + // slicing only needs to happen at the top level of an array array->offset = start; array->length = end - start; if (array->null_count != 0) { @@ -680,6 +708,8 @@ void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) return; } + // since we want to simulate a sliced table where the children are sliced, + // we slice each individual child of the record batch arr->length = end - start; for (int64_t i = 0; i < arr->n_children; ++i) { op(arr->children[i]); From e2a3c6b0b2f0728aa384d0871ebc9b14535018c4 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 15 May 2024 13:15:21 -0400 Subject: [PATCH 06/22] Update cpp/src/interop/from_arrow_host.cu Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/interop/from_arrow_host.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index db420265e59..e13066d15e0 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -118,7 +118,7 @@ struct dispatch_copy_from_arrow_host { }; // forward declaration is needed because `type_dispatch` instantiates the -// dispatch_from_arrow_device struct causing a recursive situation for struct, +// dispatch_copy_from_arrow_host struct causing a recursive situation for struct, // dictionary and list_view types. std::unique_ptr get_column_copy(ArrowSchemaView* schema, ArrowArray const* input, From e2065cdb9de6164e9d817d417870db6dc8d88fd9 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 15 May 2024 14:24:37 -0400 Subject: [PATCH 07/22] updates from feedback --- cpp/src/interop/from_arrow_host.cu | 50 ++++++++++++++------------- cpp/tests/interop/nanoarrow_utils.hpp | 6 ++-- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index e13066d15e0..2cd44e6fc1d 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -129,9 +129,9 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, - ArrowArray const* input, - data_type type, - bool skip_mask) + ArrowArray const* input, + data_type type, + bool skip_mask) { auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); @@ -369,21 +369,29 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, } // namespace -std::unique_ptr
from_arrow_host(ArrowSchemaView* schema, +std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL"); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_host`"); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + std::vector> columns; - auto type = arrow_to_cudf_type(schema); + auto type = arrow_to_cudf_type(&view); CUDF_EXPECTS(type == data_type(type_id::STRUCT), "Must pass a struct to `from_arrow_host`", cudf::data_type_error); std::transform(input->array.children, input->array.children + input->array.n_children, - schema->schema->children, + view.schema->children, std::back_inserter(columns), [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { ArrowSchemaView view; @@ -395,13 +403,21 @@ std::unique_ptr
from_arrow_host(ArrowSchemaView* schema, return std::make_unique
(std::move(columns)); } -std::unique_ptr from_arrow_host_column(ArrowSchemaView* schema, +std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL"); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`"); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + auto type = arrow_to_cudf_type(schema); - return get_column_copy(schema, &input->array, type, false, stream, mr); + return get_column_copy(&view, &input->array, type, false, stream, mr); } } // namespace detail @@ -411,16 +427,9 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_host`"); - CUDF_FUNC_RANGE(); - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_host(&view, input, stream, mr); + return detail::from_arrow_host(schema, input, stream, mr); } std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, @@ -428,16 +437,9 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`"); - CUDF_FUNC_RANGE(); - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_host_column(&view, input, stream, mr); + return detail::from_arrow_host_column(schema, input, stream, mr); } } // namespace cudf \ No newline at end of file diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 7dd8ea7a20b..a36584062c0 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -192,7 +192,7 @@ get_nanoarrow_array(std::vector const& data, std::vector const& mask ArrowBufferInit(&buf); NANOARROW_THROW_NOT_OK( ArrowBufferAppend(&buf, reinterpret_cast(data.data()), sizeof(T) * data.size())); - ArrowArraySetBuffer(tmp.get(), 1, &buf); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf)); tmp->length = data.size(); @@ -229,7 +229,7 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ } auto raw_buffer = to_arrow_bitmap(data); - ArrowArraySetBuffer(tmp.get(), 1, &raw_buffer.buffer); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &raw_buffer.buffer)); tmp->length = data.size(); return tmp; @@ -310,7 +310,7 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector const& data, ArrowBufferInit(&buf); NANOARROW_THROW_NOT_OK(ArrowBufferAppend( &buf, reinterpret_cast(offsets.data()), sizeof(int32_t) * offsets.size())); - ArrowArraySetBuffer(tmp.get(), 1, &buf); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf)); return tmp; } From 7e117a8e21d7c9505e612349e34f229d7149c709 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 20 May 2024 12:21:15 -0400 Subject: [PATCH 08/22] removing includes by suggestion --- cpp/src/interop/from_arrow_device.cu | 3 --- cpp/tests/interop/from_arrow_host_test.cpp | 4 ---- 2 files changed, 7 deletions(-) diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 4c82a5d6a8c..d4d31d1989b 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -16,17 +16,14 @@ #include "arrow_utilities.hpp" -#include #include #include -#include #include #include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index 0c75e1d8b5e..4179d623a42 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -26,10 +26,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include From 5ac8d6c2f4a9c674f8227d84434891a2ef1805b4 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 21 May 2024 12:06:12 -0400 Subject: [PATCH 09/22] updates from feedback --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/interop.hpp | 14 ++-- cpp/src/interop/arrow_utilities.cpp | 70 +++++++++++++++++ cpp/src/interop/from_arrow_device.cu | 109 +++++++++------------------ cpp/src/interop/from_arrow_host.cu | 69 ++++++++++------- 5 files changed, 157 insertions(+), 106 deletions(-) create mode 100644 cpp/src/interop/arrow_utilities.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 66f5ba0e63b..a58d221a7dd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -360,6 +360,7 @@ add_library( src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu + src/interop/arrow_utilities.cpp src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index df58390304f..766978b48a5 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -351,9 +351,9 @@ std::unique_ptr from_arrow( /** * @brief Create `cudf::table` from given ArrowDeviceArray input * - * @throws cudf::logic_error if either schema or input are NULL + * @throws std::invalid_argument if either schema or input are NULL * - * @throws cudf::logic_error if the device_type is not `ARROW_DEVICE_CPU` + * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` * * @throws cudf::data_type_error if the input array is not a struct array, * non-struct arrays should be passed to `from_arrow_host_column` instead. @@ -373,9 +373,9 @@ std::unique_ptr
from_arrow_host( /** * @brief Create `cudf::column` from given ArrowDeviceArray input * - * @throws cudf::logic_error if either schema or input are NULL + * @throws std::invalid_argument if either schema or input are NULL * - * @throws cudf::logic_error if the device_type is not `ARROW_DEVICE_CPU` + * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` * * @throws cudf::data_type_error if input arrow data type is not supported in cudf. * @@ -383,7 +383,7 @@ std::unique_ptr
from_arrow_host( * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to perform cuda allocation - * @return cudf table generated from the given Arrow data + * @return cudf column generated from the given Arrow data */ std::unique_ptr from_arrow_host_column( ArrowSchema const* schema, @@ -441,7 +441,7 @@ using unique_table_view_t = * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not * accessed after this happens. * - * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` * or `ARROW_DEVICE_CUDA_MANAGED` * * @throws cudf::data_type_error if the input array is not a struct array, non-struct @@ -489,7 +489,7 @@ using unique_column_view_t = * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not * accessed after this happens. * - * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` * or `ARROW_DEVICE_CUDA_MANAGED` * * @throws cudf::data_type_error input arrow data type is not supported. diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp new file mode 100644 index 00000000000..e4e3d20ef0c --- /dev/null +++ b/cpp/src/interop/arrow_utilities.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow_utilities.hpp" + +#include +#include + +#include + +namespace cudf { +namespace detail { +data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) +{ + switch (arrow_view->type) { + case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); + case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); + case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); + case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); + case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); + case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); + case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); + case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); + case NANOARROW_TYPE_TIMESTAMP: { + switch (arrow_view->time_unit) { + case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS); + case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); + case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); + case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); + default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error); + } + } + case NANOARROW_TYPE_DURATION: { + switch (arrow_view->time_unit) { + case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS); + case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS); + case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS); + case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS); + default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error); + } + } + case NANOARROW_TYPE_DECIMAL128: + return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale}; + default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); + } +} +} // namespace detail +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index d4d31d1989b..002a8ec1f14 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -42,49 +42,6 @@ namespace cudf { namespace detail { -data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) -{ - switch (arrow_view->type) { - case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); - case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8); - case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); - case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); - case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); - case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); - case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); - case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); - case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); - case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); - case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); - case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); - case NANOARROW_TYPE_TIMESTAMP: { - switch (arrow_view->time_unit) { - case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS); - case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); - case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); - case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); - default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error); - } - } - case NANOARROW_TYPE_DURATION: { - switch (arrow_view->time_unit) { - case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS); - case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS); - case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS); - case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS); - default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error); - } - } - case NANOARROW_TYPE_DECIMAL128: - return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale}; - default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); - } -} namespace { @@ -379,11 +336,25 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, } // namespace -unique_table_view_t from_arrow_device(ArrowSchemaView* schema, +unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || + input->device_type == ARROW_DEVICE_CUDA_HOST || + input->device_type == ARROW_DEVICE_CUDA_MANAGED, + "ArrowDeviceArray memory must be accessible to CUDA", + std::invalid_argument); + + rmm::cuda_set_device_raii dev( + rmm::cuda_device_id{static_cast(input->device_id)}); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + if (input->sync_event != nullptr) { CUDF_CUDA_TRY( cudaStreamWaitEvent(stream.value(), *reinterpret_cast(input->sync_event))); @@ -392,14 +363,14 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema, std::vector columns; owned_columns_t owned_mem; - auto type = arrow_to_cudf_type(schema); + auto type = arrow_to_cudf_type(&view); CUDF_EXPECTS(type == data_type(type_id::STRUCT), "Must pass a struct to `from_arrow_device`", cudf::data_type_error); std::transform( input->array.children, input->array.children + input->array.n_children, - schema->schema->children, + view.schema->children, std::back_inserter(columns), [&owned_mem, &stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { ArrowSchemaView view; @@ -420,18 +391,32 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema, custom_view_deleter{std::move(owned_mem)}}; } -unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema, +unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || + input->device_type == ARROW_DEVICE_CUDA_HOST || + input->device_type == ARROW_DEVICE_CUDA_MANAGED, + "ArrowDeviceArray must be accessible to CUDA", + std::invalid_argument); + + rmm::cuda_set_device_raii dev( + rmm::cuda_device_id{static_cast(input->device_id)}); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + if (input->sync_event != nullptr) { CUDF_CUDA_TRY( cudaStreamWaitEvent(stream.value(), *reinterpret_cast(input->sync_event))); } - auto type = arrow_to_cudf_type(schema); - auto [colview, owned] = get_column(schema, &input->array, type, false, stream, mr); + auto type = arrow_to_cudf_type(&view); + auto [colview, owned] = get_column(&view, &input->array, type, false, stream, mr); return unique_column_view_t{new column_view{colview}, custom_view_deleter{std::move(owned)}}; } @@ -443,20 +428,9 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || - input->device_type == ARROW_DEVICE_CUDA_HOST || - input->device_type == ARROW_DEVICE_CUDA_MANAGED, - "ArrowDeviceArray memory must be accessible to CUDA"); - CUDF_FUNC_RANGE(); - rmm::cuda_set_device_raii dev( - rmm::cuda_device_id{static_cast(input->device_id)}); - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device(&view, input, stream, mr); + return detail::from_arrow_device(schema, input, stream, mr); } unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, @@ -464,20 +438,9 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || - input->device_type == ARROW_DEVICE_CUDA_HOST || - input->device_type == ARROW_DEVICE_CUDA_MANAGED, - "ArrowDeviceArray must be accessible to CUDA"); - CUDF_FUNC_RANGE(); - rmm::cuda_set_device_raii dev( - rmm::cuda_device_id{static_cast(input->device_id)}); - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device_column(&view, input, stream, mr); + return detail::from_arrow_device_column(schema, input, stream, mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 2cd44e6fc1d..bc026e549f1 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -120,6 +120,12 @@ struct dispatch_copy_from_arrow_host { // forward declaration is needed because `type_dispatch` instantiates the // dispatch_copy_from_arrow_host struct causing a recursive situation for struct, // dictionary and list_view types. +// +// This function is simply a convenience wrapper around the dispatch functor with +// some extra handling to avoid having to reproduce it for all of the nested types. +// It also allows us to centralize the location where the recursive calls happen +// so that we only need to forward declare this one function, rather than multiple +// functions which handle the overloads for nested types (list, struct, etc.) std::unique_ptr get_column_copy(ArrowSchemaView* schema, ArrowArray const* input, data_type type, @@ -168,7 +174,9 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()length == 0) { return make_empty_column(type_id::STRING); } - const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + // offsets column should contain no nulls so we can put nullptr for the bitmask + // nulls are tracked in the parent string column itself, not in the offsets + void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; ArrowArray offsets_array = { .length = input->offset + input->length + 1, .null_count = 0, @@ -178,9 +186,11 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(offset_buffers[1])[input->length + input->offset]; - const void* char_buffers[2] = {nullptr, input->buffers[2]}; + void const* char_buffers[2] = {nullptr, input->buffers[2]}; ArrowArray char_array = { .length = char_data_length, .null_count = 0, @@ -196,6 +206,8 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator() std::unique_ptr dispatch_copy_from_arrow_host::operator()( ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) { - char buffer[1024]; - std::vector> child_columns; - std::transform(input->children, - input->children + input->n_children, - schema->schema->children, - std::back_inserter(child_columns), - [this, input, &buffer](ArrowArray const* child, ArrowSchema const* child_schema) { - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); - auto type = arrow_to_cudf_type(&view); - - auto out = get_column_copy(&view, child, type, false, stream, mr); - return std::make_unique( - cudf::detail::slice(out->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); - }); + std::transform( + input->children, + input->children + input->n_children, + schema->schema->children, + std::back_inserter(child_columns), + [this, input](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + + auto out = get_column_copy(&view, child, type, false, stream, mr); + return input->offset == 0 + ? std::move(out) + : std::make_unique( + cudf::detail::slice(out->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); + }); auto out_mask = std::move(*(get_mask_buffer(input))); if (input->buffers[validity_buffer_idx] != nullptr) { @@ -375,9 +388,11 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_host`"); + "ArrowDeviceArray must have CPU device type for `from_arrow_host`", + std::invalid_argument); ArrowSchemaView view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); @@ -409,9 +424,11 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, - "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`"); + "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`", + std::invalid_argument); ArrowSchemaView view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); From 679993ca153a4b8eadfae4b249208c5c66a3f5a0 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 21 May 2024 12:24:39 -0400 Subject: [PATCH 10/22] add `from_arrow` overload --- cpp/include/cudf/interop.hpp | 18 ++++++++++++++++++ cpp/src/interop/from_arrow_host.cu | 15 +++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 766978b48a5..96e1f5d009c 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -348,6 +348,24 @@ std::unique_ptr from_arrow( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * @throws cudf::data_type_error if the input array is not a struct array. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowArray` pointer that needs to be converted to cudf::table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::table` + * @return cudf table generated from given arrow data + */ +std::unique_ptr from_arrow(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Create `cudf::table` from given ArrowDeviceArray input * diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index bc026e549f1..da91096f329 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -459,4 +459,19 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, return detail::from_arrow_host_column(schema, input, stream, mr); } +std::unique_ptr
from_arrow(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + ArrowDeviceArray const device_input = { + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, + }; + return detail::from_arrow_host(schema, &device_input, stream, mr); +} + } // namespace cudf \ No newline at end of file From 8610c91b1aa4f676ce022b3de35aab979b240940 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 21 May 2024 15:09:33 -0400 Subject: [PATCH 11/22] remove excess comment --- cpp/src/interop/from_arrow_host.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index da91096f329..10e3f53c88e 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -141,7 +141,7 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSch { auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); - // mask-to-bools expects the mask to be bitmask_type aligned/padded + auto data = rmm::device_buffer(buffer_length, stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), reinterpret_cast(data_buffer), @@ -467,9 +467,9 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, CUDF_FUNC_RANGE(); ArrowDeviceArray const device_input = { - .array = *input, - .device_id = -1, - .device_type = ARROW_DEVICE_CPU, + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, }; return detail::from_arrow_host(schema, &device_input, stream, mr); } From 2f492930bfab5a0dd0d9440e046a5997cc0897e0 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 22 May 2024 11:06:25 -0400 Subject: [PATCH 12/22] shift function from to_arrow_utilities to just arrow_utilities --- cpp/CMakeLists.txt | 1 - cpp/src/interop/arrow_utilities.cpp | 20 ++++++++++++ cpp/src/interop/arrow_utilities.hpp | 15 +++++++++ cpp/src/interop/to_arrow_device.cu | 1 - cpp/src/interop/to_arrow_schema.cpp | 2 +- cpp/src/interop/to_arrow_utilities.cpp | 44 -------------------------- cpp/src/interop/to_arrow_utilities.hpp | 34 -------------------- 7 files changed, 36 insertions(+), 81 deletions(-) delete mode 100644 cpp/src/interop/to_arrow_utilities.cpp delete mode 100644 cpp/src/interop/to_arrow_utilities.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a58d221a7dd..3fc5a7d342a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -366,7 +366,6 @@ add_library( src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu src/interop/to_arrow_schema.cpp - src/interop/to_arrow_utilities.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index e4e3d20ef0c..ca85677b03f 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -66,5 +66,25 @@ data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); } } + +ArrowType id_to_arrow_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; + case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; + case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; + case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; + case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; + case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; + case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; + case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; + case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; + case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; + case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); + } +} + } // namespace detail } // namespace cudf \ No newline at end of file diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 152b8c3e939..31011f85bd1 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -19,6 +19,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -30,7 +31,21 @@ namespace detail { static constexpr int validity_buffer_idx = 0; static constexpr int fixed_width_data_buffer_idx = 1; +/** + * @brief Map ArrowType id to cudf column type id + * + * @param arrow_view SchemaView to pull the logical and storage types from + * @return Column type id + */ data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view); +/** + * @brief Map cudf column type id to ArrowType id + * + * @param id Column type id + * @return ArrowType id + */ +ArrowType id_to_arrow_type(cudf::type_id id); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index f2b1669df9b..ebfd6605977 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,7 +15,6 @@ */ #include "arrow_utilities.hpp" -#include "to_arrow_utilities.hpp" #include #include diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 6f943593dce..19915464236 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "to_arrow_utilities.hpp" +#include "arrow_utilities.hpp" #include #include diff --git a/cpp/src/interop/to_arrow_utilities.cpp b/cpp/src/interop/to_arrow_utilities.cpp deleted file mode 100644 index 04d17847273..00000000000 --- a/cpp/src/interop/to_arrow_utilities.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "to_arrow_utilities.hpp" - -#include - -namespace cudf { -namespace detail { - -ArrowType id_to_arrow_type(cudf::type_id id) -{ - switch (id) { - case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; - case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; - case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; - case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; - case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; - case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; - case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; - case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; - case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; - case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; - case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; - case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; - default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); - } -} - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/to_arrow_utilities.hpp b/cpp/src/interop/to_arrow_utilities.hpp deleted file mode 100644 index 3c01c726a7b..00000000000 --- a/cpp/src/interop/to_arrow_utilities.hpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cudf { -namespace detail { - -/** - * @brief Map cudf column type id to ArrowType id - * - * @param id Column type id - * @return ArrowType id - */ -ArrowType id_to_arrow_type(cudf::type_id id); - -} // namespace detail -} // namespace cudf From 5b57bb381902ecda8ea7fdfdd3b8b4bcd38afe2e Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 22 May 2024 15:50:38 -0400 Subject: [PATCH 13/22] fix style problems --- cpp/include/cudf/interop.hpp | 2 +- cpp/src/interop/arrow_utilities.cpp | 2 +- cpp/src/interop/arrow_utilities.hpp | 2 +- cpp/src/interop/from_arrow_host.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 96e1f5d009c..ed344ed0e5d 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -351,7 +351,7 @@ std::unique_ptr from_arrow( /** * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input * - * @throws std::invalid_argument if either schema or input are NULL + * @throws std::invalid_argument if either schema or input are NULL * * @throws cudf::data_type_error if the input array is not a struct array. * diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index ca85677b03f..05beecfbf9b 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -87,4 +87,4 @@ ArrowType id_to_arrow_type(cudf::type_id id) } } // namespace detail -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 31011f85bd1..defddb4dc42 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -33,7 +33,7 @@ static constexpr int fixed_width_data_buffer_idx = 1; /** * @brief Map ArrowType id to cudf column type id - * + * * @param arrow_view SchemaView to pull the logical and storage types from * @return Column type id */ diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 10e3f53c88e..c49ed7f2c51 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -474,4 +474,4 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, return detail::from_arrow_host(schema, &device_input, stream, mr); } -} // namespace cudf \ No newline at end of file +} // namespace cudf From 923c42219f45a9d959752d66091a3b5cc5cce476 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 22 May 2024 15:52:45 -0400 Subject: [PATCH 14/22] forward declare ArrowArray --- cpp/include/cudf/interop.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index ed344ed0e5d..116fc31b18e 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -46,6 +46,8 @@ struct ArrowDeviceArray; struct ArrowSchema; +struct ArrowArray; + namespace cudf { /** * @addtogroup interop_dlpack From d4290049548cdabe570555e28760aa00fa966152 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 22 May 2024 15:53:04 -0400 Subject: [PATCH 15/22] fix forgotten view usage --- cpp/src/interop/from_arrow_host.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index c49ed7f2c51..25e1de74be5 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -433,7 +433,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowSchemaView view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - auto type = arrow_to_cudf_type(schema); + auto type = arrow_to_cudf_type(&view); return get_column_copy(&view, &input->array, type, false, stream, mr); } From c0a3c4ecbbdc8e1d7e5ae95639c43470d8fa7ec5 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 22 May 2024 17:25:53 -0400 Subject: [PATCH 16/22] add `from_arrow_column` overload --- cpp/include/cudf/interop.hpp | 24 ++++++++++++++++++++++++ cpp/src/interop/from_arrow_host.cu | 15 +++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 116fc31b18e..e48f3e0dfbd 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -357,6 +357,8 @@ std::unique_ptr from_arrow( * * @throws cudf::data_type_error if the input array is not a struct array. * + * The conversion will not call release on the input Array. + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowArray` pointer that needs to be converted to cudf::table * @param stream CUDA stream used for device memory operations and kernel launches @@ -368,6 +370,24 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * The conversion will not call release on the input Array. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowArray` pointer that needs to be converted to cudf::column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::column` + * @return cudf column generated from given arrow data + */ +std::unique_ptr from_arrow_column(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Create `cudf::table` from given ArrowDeviceArray input * @@ -378,6 +398,8 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, * @throws cudf::data_type_error if the input array is not a struct array, * non-struct arrays should be passed to `from_arrow_host_column` instead. * + * The conversion will not call release on the input Array. + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches @@ -399,6 +421,8 @@ std::unique_ptr
from_arrow_host( * * @throws cudf::data_type_error if input arrow data type is not supported in cudf. * + * The conversion will not call release on the input Array. + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 25e1de74be5..3e988afba0a 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -474,4 +474,19 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, return detail::from_arrow_host(schema, &device_input, stream, mr); } +std::unique_ptr from_arrow_column(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + ArrowDeviceArray const device_input = { + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, + }; + return detail::from_arrow_host_column(schema, &device_input, stream, mr); +} + } // namespace cudf From 4fa83d00bc45eb44b4425628591dd8e17651c32e Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 23 May 2024 12:31:30 -0400 Subject: [PATCH 17/22] fix test and lint --- cpp/src/interop/from_arrow_host.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 3e988afba0a..98e4523ca44 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -294,7 +294,7 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()offset == 0 + return input->offset == 0 && input->length == out->size() ? std::move(out) : std::make_unique( cudf::detail::slice(out->view(), @@ -482,8 +482,8 @@ std::unique_ptr from_arrow_column(ArrowSchema const* schema, CUDF_FUNC_RANGE(); ArrowDeviceArray const device_input = { - .array = *input, - .device_id = -1, + .array = *input, + .device_id = -1, .device_type = ARROW_DEVICE_CPU, }; return detail::from_arrow_host_column(schema, &device_input, stream, mr); From ad283033356e5428770366b796b7f5364ac7eb02 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 23 May 2024 18:48:00 -0400 Subject: [PATCH 18/22] refactor and shift tests --- cpp/tests/interop/from_arrow_host_test.cpp | 179 ++------------------- cpp/tests/interop/nanoarrow_utils.hpp | 49 ++++++ cpp/tests/interop/to_arrow_device_test.cpp | 91 +++++------ 3 files changed, 101 insertions(+), 218 deletions(-) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index 4179d623a42..bb0d40ca18e 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -40,172 +40,22 @@ std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_host_tables(cudf::size_type length) { - std::vector int64_data(length); - std::vector bool_data(length); - std::vector string_data(length); - std::vector validity(length); - std::vector bool_validity(length); - std::vector bool_data_validity; - cudf::size_type length_of_individual_list = 3; - cudf::size_type length_of_list = length_of_individual_list * length; - std::vector list_int64_data(length_of_list); - std::vector list_int64_data_validity(length_of_list); - std::vector list_offsets(length + 1); - - std::vector> columns; - - std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); - std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); - auto validity_generator = []() { return rand() % 7 != 0; }; - std::generate( - list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - std::generate( - list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { - return (n++) * length_of_individual_list; - }); - std::generate(bool_data.begin(), bool_data.end(), validity_generator); - std::generate( - string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); - std::generate(validity.begin(), validity.end(), validity_generator); - std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); - - std::transform(bool_validity.cbegin(), - bool_validity.cend(), - std::back_inserter(bool_data_validity), - [](auto val) { return static_cast(val); }); - - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) - .release()); - columns.emplace_back( - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) - .release()); - auto col4 = cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - bool_data.begin(), bool_data.end(), bool_validity.begin()) - .release()); - auto list_child_column = cudf::test::fixed_width_column_wrapper( - list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); - auto list_offsets_column = - cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); - auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); - columns.emplace_back(cudf::make_lists_column(length, - list_offsets_column.release(), - list_child_column.release(), - list_nulls, - std::move(*list_mask))); - auto int_column = cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) - .release(); - auto str_column = - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) - .release(); - vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); - auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); - columns.emplace_back( - cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); - - nanoarrow::UniqueSchema schema; - ArrowSchemaInit(schema.get()); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema.get(), 6)); - - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[0], "a")); - if (columns[0]->null_count() > 0) { - schema->children[0]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[0]->flags = 0; - } - - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[1], NANOARROW_TYPE_STRING)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[1], "b")); - if (columns[1]->null_count() > 0) { - schema->children[1]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[1]->flags = 0; - } - - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); - NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); - NANOARROW_THROW_NOT_OK( - ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[2], "c")); - if (columns[2]->null_count() > 0) { - schema->children[2]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[2]->flags = 0; - } - - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[3], NANOARROW_TYPE_BOOL)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[3], "d")); - if (columns[3]->null_count() > 0) { - schema->children[3]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[3]->flags = 0; - } - - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[4], NANOARROW_TYPE_LIST)); - NANOARROW_THROW_NOT_OK( - ArrowSchemaInitFromType(schema->children[4]->children[0], NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4]->children[0], "element")); - if (columns[4]->child(1).null_count() > 0) { - schema->children[4]->children[0]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[4]->children[0]->flags = 0; - } - - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4], "e")); - if (columns[4]->has_nulls()) { - schema->children[4]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[4]->flags = 0; - } - - ArrowSchemaInit(schema->children[5]); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema->children[5], 2)); - NANOARROW_THROW_NOT_OK( - ArrowSchemaInitFromType(schema->children[5]->children[0], NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[0], "integral")); - if (columns[5]->child(0).has_nulls()) { - schema->children[5]->children[0]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[5]->children[0]->flags = 0; - } - - NANOARROW_THROW_NOT_OK( - ArrowSchemaInitFromType(schema->children[5]->children[1], NANOARROW_TYPE_STRING)); - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[1], "string")); - if (columns[5]->child(1).has_nulls()) { - schema->children[5]->children[1]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[5]->children[1]->flags = 0; - } - - NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5], "f")); - if (columns[5]->has_nulls()) { - schema->children[5]->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->children[5]->flags = 0; - } + auto [table, schema, test_data] = get_nanoarrow_cudf_table(length); - auto int64_array = get_nanoarrow_array(int64_data, validity); - auto string_array = get_nanoarrow_array(string_data, validity); + auto int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); + auto string_array = + get_nanoarrow_array(test_data.string_data, test_data.validity); cudf::dictionary_column_view view(dict_col->view()); auto keys = cudf::test::to_host(view.keys()).first; auto indices = cudf::test::to_host(view.indices()).first; auto dict_array = get_nanoarrow_dict_array(std::vector(keys.begin(), keys.end()), std::vector(indices.begin(), indices.end()), - validity); - auto boolarray = get_nanoarrow_array(bool_data, bool_validity); - auto list_array = get_nanoarrow_list_array( - list_int64_data, list_offsets, list_int64_data_validity, bool_data_validity); + test_data.validity); + auto boolarray = get_nanoarrow_array(test_data.bool_data, test_data.bool_validity); + auto list_array = get_nanoarrow_list_array(test_data.list_int64_data, + test_data.list_offsets, + test_data.list_int64_data_validity, + test_data.bool_data_validity); nanoarrow::UniqueArray arrow; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); @@ -217,8 +67,8 @@ get_nanoarrow_host_tables(cudf::size_type length) boolarray.move(arrow->children[3]); list_array.move(arrow->children[4]); - int64_array = get_nanoarrow_array(int64_data, validity); - string_array = get_nanoarrow_array(string_data, validity); + int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); + string_array = get_nanoarrow_array(test_data.string_data, test_data.validity); int64_array.move(arrow->children[5]->children[0]); string_array.move(arrow->children[5]->children[1]); @@ -226,7 +76,7 @@ get_nanoarrow_host_tables(cudf::size_type length) ArrowBitmapInit(&struct_validity); NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&struct_validity, length)); ArrowBitmapAppendInt8Unsafe( - &struct_validity, reinterpret_cast(bool_data_validity.data()), length); + &struct_validity, reinterpret_cast(test_data.bool_data_validity.data()), length); arrow->children[5]->length = length; ArrowArraySetValidityBitmap(arrow->children[5], &struct_validity); arrow->children[5]->null_count = @@ -239,8 +89,7 @@ get_nanoarrow_host_tables(cudf::size_type length) CUDF_FAIL("failed to build example arrays"); } - return std::make_tuple( - std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); + return std::make_tuple(std::move(table), std::move(schema), std::move(arrow)); } struct FromArrowHostDeviceTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index a36584062c0..5b28a654820 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -30,6 +30,52 @@ #include +struct generated_test_data { + generated_test_data(cudf::size_type length) + : int64_data(length), + bool_data(length), + string_data(length), + validity(length), + bool_validity(length), + list_int64_data(3 * length), + list_int64_data_validity(3 * length), + list_offsets(length + 1) + { + cudf::size_type length_of_individual_list = 3; + cudf::size_type length_of_list = length_of_individual_list * length; + + std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); + std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); + auto validity_generator = []() { return rand() % 7 != 0; }; + std::generate( + list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); + std::generate( + list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { + return (n++) * length_of_individual_list; + }); + std::generate(bool_data.begin(), bool_data.end(), validity_generator); + std::generate( + string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); + std::generate(validity.begin(), validity.end(), validity_generator); + std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); + + std::transform(bool_validity.cbegin(), + bool_validity.cend(), + std::back_inserter(bool_data_validity), + [](auto val) { return static_cast(val); }); + } + + std::vector int64_data; + std::vector bool_data; + std::vector string_data; + std::vector validity; + std::vector bool_validity; + std::vector bool_data_validity; + std::vector list_int64_data; + std::vector list_int64_data_validity; + std::vector list_offsets; +}; + // no-op allocator/deallocator to set into ArrowArray buffers that we don't // want to own their buffers. static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){ @@ -327,3 +373,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, std::vector list_mask(list_validity); return get_nanoarrow_list_array(data_vector, offset, data_mask, list_mask); } + +std::tuple, nanoarrow::UniqueSchema, generated_test_data> +get_nanoarrow_cudf_table(cudf::size_type length); \ No newline at end of file diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index a10517128d2..131738da34c 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -38,78 +38,55 @@ #include -std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> -get_nanoarrow_tables(cudf::size_type length) +std::tuple, nanoarrow::UniqueSchema, generated_test_data> +get_nanoarrow_cudf_table(cudf::size_type length) { - std::vector int64_data(length); - std::vector bool_data(length); - std::vector string_data(length); - std::vector validity(length); - std::vector bool_validity(length); - std::vector bool_data_validity; - cudf::size_type length_of_individual_list = 3; - cudf::size_type length_of_list = length_of_individual_list * length; - std::vector list_int64_data(length_of_list); - std::vector list_int64_data_validity(length_of_list); - std::vector list_offsets(length + 1); + generated_test_data test_data(length); std::vector> columns; - std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); - std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); - auto validity_generator = []() { return rand() % 7 != 0; }; - std::generate( - list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - std::generate( - list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { - return (n++) * length_of_individual_list; - }); - std::generate(bool_data.begin(), bool_data.end(), validity_generator); - std::generate( - string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); - std::generate(validity.begin(), validity.end(), validity_generator); - std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); - - std::transform(bool_validity.cbegin(), - bool_validity.cend(), - std::back_inserter(bool_data_validity), - [](auto val) { return static_cast(val); }); - - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) + columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.int64_data.begin(), + test_data.int64_data.end(), + test_data.validity.begin()) + .release()); + columns.emplace_back(cudf::test::strings_column_wrapper(test_data.string_data.begin(), + test_data.string_data.end(), + test_data.validity.begin()) .release()); - columns.emplace_back( - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) - .release()); auto col4 = cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()); + test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - bool_data.begin(), bool_data.end(), bool_validity.begin()) + columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), + test_data.bool_data.end(), + test_data.bool_validity.begin()) .release()); - auto list_child_column = cudf::test::fixed_width_column_wrapper( - list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); - auto list_offsets_column = - cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto list_child_column = + cudf::test::fixed_width_column_wrapper(test_data.list_int64_data.begin(), + test_data.list_int64_data.end(), + test_data.list_int64_data_validity.begin()); + auto list_offsets_column = cudf::test::fixed_width_column_wrapper( + test_data.list_offsets.begin(), test_data.list_offsets.end()); auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); + test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back(cudf::make_lists_column(length, list_offsets_column.release(), list_child_column.release(), list_nulls, std::move(*list_mask))); - auto int_column = cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) - .release(); + auto int_column = + cudf::test::fixed_width_column_wrapper( + test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()) + .release(); auto str_column = - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + cudf::test::strings_column_wrapper( + test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; cols.push_back(move(int_column)); cols.push_back(move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); + test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); @@ -196,6 +173,15 @@ get_nanoarrow_tables(cudf::size_type length) schema->children[5]->flags = 0; } + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(test_data)); +} + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_tables(cudf::size_type length) +{ + auto [table, schema, test_data] = get_nanoarrow_cudf_table(length); + nanoarrow::UniqueArray arrow; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); arrow->length = length; @@ -229,8 +215,7 @@ get_nanoarrow_tables(cudf::size_type length) CUDF_FAIL("failed to build example arrays"); } - return std::make_tuple( - std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); + return std::make_tuple(std::move(table), std::move(schema), std::move(arrow)); } // populate an ArrowArray list array from device buffers using a no-op From 4b4c887d4305263cb0b34ec0dd210e34f3946b22 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 23 May 2024 23:02:38 +0000 Subject: [PATCH 19/22] fix build issues --- cpp/tests/interop/from_arrow_host_test.cpp | 2 +- cpp/tests/interop/nanoarrow_utils.hpp | 3 +-- cpp/tests/interop/to_arrow_device_test.cpp | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index bb0d40ca18e..e6e52099a0c 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -45,7 +45,7 @@ get_nanoarrow_host_tables(cudf::size_type length) auto int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); auto string_array = get_nanoarrow_array(test_data.string_data, test_data.validity); - cudf::dictionary_column_view view(dict_col->view()); + cudf::dictionary_column_view view(table->get_column(2).view()); auto keys = cudf::test::to_host(view.keys()).first; auto indices = cudf::test::to_host(view.indices()).first; auto dict_array = get_nanoarrow_dict_array(std::vector(keys.begin(), keys.end()), diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 5b28a654820..a79e6fdc49c 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -42,7 +42,6 @@ struct generated_test_data { list_offsets(length + 1) { cudf::size_type length_of_individual_list = 3; - cudf::size_type length_of_list = length_of_individual_list * length; std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); @@ -375,4 +374,4 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, } std::tuple, nanoarrow::UniqueSchema, generated_test_data> -get_nanoarrow_cudf_table(cudf::size_type length); \ No newline at end of file +get_nanoarrow_cudf_table(cudf::size_type length); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 131738da34c..37ff1a48ee5 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -186,17 +186,17 @@ get_nanoarrow_tables(cudf::size_type length) NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); arrow->length = length; - populate_from_col(arrow->children[0], columns[0]->view()); - populate_from_col(arrow->children[1], columns[1]->view()); + populate_from_col(arrow->children[0], table->get_column(0).view()); + populate_from_col(arrow->children[1], table->get_column(1).view()); populate_dict_from_col(arrow->children[2], - cudf::dictionary_column_view(columns[2]->view())); + cudf::dictionary_column_view(table->get_column(2).view())); - populate_from_col(arrow->children[3], columns[3]->view()); - cudf::lists_column_view list_view{columns[4]->view()}; + populate_from_col(arrow->children[3], table->get_column(3).view()); + cudf::lists_column_view list_view{table->get_column(4).view()}; populate_list_from_col(arrow->children[4], list_view); populate_from_col(arrow->children[4]->children[0], list_view.child()); - cudf::structs_column_view struct_view{columns[5]->view()}; + cudf::structs_column_view struct_view{table->get_column(5).view()}; populate_from_col(arrow->children[5]->children[0], struct_view.child(0)); populate_from_col(arrow->children[5]->children[1], struct_view.child(1)); arrow->children[5]->length = struct_view.size(); From f357ad796a6b4fa1771c7cc1e478e78a1ac680e7 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 23 May 2024 20:52:15 -0400 Subject: [PATCH 20/22] style fixes --- cpp/include/cudf/interop.hpp | 8 ++++---- cpp/tests/interop/to_arrow_device_test.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index e48f3e0dfbd..f3ff0009d5c 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -358,7 +358,7 @@ std::unique_ptr from_arrow( * @throws cudf::data_type_error if the input array is not a struct array. * * The conversion will not call release on the input Array. - * + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowArray` pointer that needs to be converted to cudf::table * @param stream CUDA stream used for device memory operations and kernel launches @@ -376,7 +376,7 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, * @throws std::invalid_argument if either schema or input are NULL * * The conversion will not call release on the input Array. - * + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowArray` pointer that needs to be converted to cudf::column * @param stream CUDA stream used for device memory operations and kernel launches @@ -399,7 +399,7 @@ std::unique_ptr from_arrow_column(ArrowSchema const* schema, * non-struct arrays should be passed to `from_arrow_host_column` instead. * * The conversion will not call release on the input Array. - * + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches @@ -422,7 +422,7 @@ std::unique_ptr
from_arrow_host( * @throws cudf::data_type_error if input arrow data type is not supported in cudf. * * The conversion will not call release on the input Array. - * + * * @param schema `ArrowSchema` pointer to describe the type of the data * @param input `ArrowDeviceArray` pointer to object owning the Arrow data * @param stream CUDA stream used for device memory operations and kernel launches diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 37ff1a48ee5..4c73cd637a4 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -188,8 +188,8 @@ get_nanoarrow_tables(cudf::size_type length) populate_from_col(arrow->children[0], table->get_column(0).view()); populate_from_col(arrow->children[1], table->get_column(1).view()); - populate_dict_from_col(arrow->children[2], - cudf::dictionary_column_view(table->get_column(2).view())); + populate_dict_from_col( + arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view())); populate_from_col(arrow->children[3], table->get_column(3).view()); cudf::lists_column_view list_view{table->get_column(4).view()}; From b11d8463e569415425c46354ddc63f222c19b5fd Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 24 May 2024 11:12:50 -0400 Subject: [PATCH 21/22] fix expected exception --- cpp/tests/interop/from_arrow_device_test.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 66bd4dd1bfb..d776ca57ef6 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -49,23 +49,23 @@ TYPED_TEST_SUITE(FromArrowDeviceTestDurationsTest, cudf::test::DurationTypes); TEST_F(FromArrowDeviceTest, FailConditions) { // can't pass null for schema or device array - EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), std::invalid_argument); // can't pass null for device array ArrowSchema schema; - EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), std::invalid_argument); // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED // should fail with ARROW_DEVICE_CPU ArrowDeviceArray arr; arr.device_type = ARROW_DEVICE_CPU; - EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), std::invalid_argument); // can't pass null for schema or device array - EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), std::invalid_argument); // can't pass null for device array - EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), std::invalid_argument); // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED // should fail with ARROW_DEVICE_CPU - EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), std::invalid_argument); } TEST_F(FromArrowDeviceTest, EmptyTable) From 72031972197dacd23dc26e6c7a21d89520cbbc7a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 28 May 2024 11:12:43 -0400 Subject: [PATCH 22/22] Update cpp/src/interop/from_arrow_host.cu Co-authored-by: Vyas Ramasubramani --- cpp/src/interop/from_arrow_host.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 98e4523ca44..36bb35d9419 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -73,7 +73,7 @@ struct dispatch_copy_from_arrow_host { !std::is_same_v)> std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) { - CUDF_FAIL("Unsupported type in copy_from_arrow_device."); + CUDF_FAIL("Unsupported type in copy_from_arrow_host."); } template