From 14d7d5c8a81f6b8a765eb20c88670ee54a687a00 Mon Sep 17 00:00:00 2001 From: Crystal Zhou Date: Sat, 23 Nov 2024 21:57:07 -0500 Subject: [PATCH 1/7] Optimize string view to offset string cast --- .../compute/kernels/scalar_cast_string.cc | 58 ++++++++++++------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 4edf00225d317..82d8dbb3b418f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -314,7 +314,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou template enable_if_t::value && is_base_binary_type::value, Status> BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - using OutputBuilderType = typename TypeTraits::BuilderType; + using offset_type = typename O::offset_type; + using DataBuilder = TypedBufferBuilder; + using OffsetBuilder = TypedBufferBuilder; const CastOptions& options = checked_cast(*ctx->state()).options; const ArraySpan& input = batch[0].array; @@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } } - const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( - input.GetValues(1), input.length); - - // TODO(GH-43573): A more efficient implementation that copies the validity - // bitmap all at once is possible, but would mean we don't delegate all the - // building logic to the ArrayBuilder implementation for the output type. - OutputBuilderType builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); - RETURN_NOT_OK(builder.Resize(input.length)); - RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes)); - arrow::internal::ArraySpanInlineVisitor visitor; - RETURN_NOT_OK(visitor.VisitStatus( - input, - [&](std::string_view v) { - // Append valid string view - return builder.Append(v); + ArrayData* output = out->array_data().get(); + output->length = input.length; + output->SetNullCount(input.null_count); + + // Set up bitmap + if (input.offset == output->offset) { + output->buffers[0] = input.GetBuffer(0); + } else { + if (input.buffers[0].data != NULLPTR) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } + } + + // Set up offset and data buffer + DataBuilder data_builder(ctx->memory_pool()); + OffsetBuilder offset_builder(ctx->memory_pool()); + RETURN_NOT_OK(offset_builder.Reserve(batch.length + 1)); + offset_builder.UnsafeAppend(0); // offsets start at 0 + RETURN_NOT_OK(VisitArraySpanInline( + batch[0].array, + [&](std::string_view s) { + // for non-null value, append string view to buffer and calculate offset + ARROW_RETURN_NOT_OK(data_builder.Append( + reinterpret_cast(s.data()), static_cast(s.size()))); + offset_builder.UnsafeAppend(static_cast(data_builder.length())); + return Status::OK(); }, [&]() { - // Append null - builder.UnsafeAppendNull(); + // for null value, no need to update buffer + offset_builder.UnsafeAppend(static_cast(data_builder.length())); return Status::OK(); })); - - std::shared_ptr output_array; - RETURN_NOT_OK(builder.FinishInternal(&output_array)); - out->value = std::move(output_array); + RETURN_NOT_OK(data_builder.Finish(&output->buffers[2])); + RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1])); return Status::OK(); } From 3d6b7156e9aa4c71410ff57e1d98abbb182df312 Mon Sep 17 00:00:00 2001 From: Crystal <45134936+CrystalZhou0529@users.noreply.github.com> Date: Sat, 30 Nov 2024 17:41:59 -0500 Subject: [PATCH 2/7] Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc Co-authored-by: mwish --- cpp/src/arrow/compute/kernels/scalar_cast_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 82d8dbb3b418f..bc682c6257fcb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -360,7 +360,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou return Status::OK(); }, [&]() { - // for null value, no need to update buffer + // for null value, no need to update data buffer offset_builder.UnsafeAppend(static_cast(data_builder.length())); return Status::OK(); })); From 5c4a5d1b2d2f98403b0a98a9d80c176e99253e6f Mon Sep 17 00:00:00 2001 From: Crystal <45134936+CrystalZhou0529@users.noreply.github.com> Date: Sat, 30 Nov 2024 17:42:05 -0500 Subject: [PATCH 3/7] Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc Co-authored-by: mwish --- cpp/src/arrow/compute/kernels/scalar_cast_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index bc682c6257fcb..00e988e005b64 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -333,7 +333,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou output->length = input.length; output->SetNullCount(input.null_count); - // Set up bitmap + // Set up validity bitmap if (input.offset == output->offset) { output->buffers[0] = input.GetBuffer(0); } else { From 068300ea41c952b7503b6be973bfb2edf78d262e Mon Sep 17 00:00:00 2001 From: Crystal Zhou Date: Sat, 30 Nov 2024 18:09:15 -0500 Subject: [PATCH 4/7] Fix allocation of data buffer size --- .../arrow/compute/kernels/scalar_cast_string.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 00e988e005b64..3298243a83ed8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -337,6 +337,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou if (input.offset == output->offset) { output->buffers[0] = input.GetBuffer(0); } else { + // When the offsets are different (e.g., due to slice operation), we need to check if + // the null bitmap buffer is not null before copying it. The null bitmap buffer can be + // null if the input array value does not contain any null value. if (input.buffers[0].data != NULLPTR) { ARROW_ASSIGN_OR_RAISE( output->buffers[0], @@ -346,16 +349,19 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } // Set up offset and data buffer - DataBuilder data_builder(ctx->memory_pool()); OffsetBuilder offset_builder(ctx->memory_pool()); RETURN_NOT_OK(offset_builder.Reserve(batch.length + 1)); offset_builder.UnsafeAppend(0); // offsets start at 0 + const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( + input.GetValues(1), input.length); + DataBuilder data_builder(ctx->memory_pool()); + RETURN_NOT_OK(data_builder.Reserve(sum_of_binary_view_sizes)); RETURN_NOT_OK(VisitArraySpanInline( batch[0].array, [&](std::string_view s) { // for non-null value, append string view to buffer and calculate offset - ARROW_RETURN_NOT_OK(data_builder.Append( - reinterpret_cast(s.data()), static_cast(s.size()))); + data_builder.UnsafeAppend(reinterpret_cast(s.data()), + static_cast(s.size())); offset_builder.UnsafeAppend(static_cast(data_builder.length())); return Status::OK(); }, @@ -364,8 +370,8 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou offset_builder.UnsafeAppend(static_cast(data_builder.length())); return Status::OK(); })); - RETURN_NOT_OK(data_builder.Finish(&output->buffers[2])); RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1])); + RETURN_NOT_OK(data_builder.Finish(&output->buffers[2])); return Status::OK(); } From 63411696aafe1f999b566bf4723706756f0f7bc0 Mon Sep 17 00:00:00 2001 From: Crystal Zhou <45134936+CrystalZhou0529@users.noreply.github.com> Date: Fri, 7 Feb 2025 14:18:18 -0500 Subject: [PATCH 5/7] Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc Co-authored-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/kernels/scalar_cast_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 3298243a83ed8..d31050f3bfbba 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -350,7 +350,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou // Set up offset and data buffer OffsetBuilder offset_builder(ctx->memory_pool()); - RETURN_NOT_OK(offset_builder.Reserve(batch.length + 1)); + RETURN_NOT_OK(offset_builder.Reserve(input.length + 1)); offset_builder.UnsafeAppend(0); // offsets start at 0 const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( input.GetValues(1), input.length); From 4312be2c70b81e17e727b53e596472b89234190f Mon Sep 17 00:00:00 2001 From: Crystal Zhou <45134936+CrystalZhou0529@users.noreply.github.com> Date: Fri, 7 Feb 2025 14:18:26 -0500 Subject: [PATCH 6/7] Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc Co-authored-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/kernels/scalar_cast_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index d31050f3bfbba..a3393614bc5ab 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -357,7 +357,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou DataBuilder data_builder(ctx->memory_pool()); RETURN_NOT_OK(data_builder.Reserve(sum_of_binary_view_sizes)); RETURN_NOT_OK(VisitArraySpanInline( - batch[0].array, + input, [&](std::string_view s) { // for non-null value, append string view to buffer and calculate offset data_builder.UnsafeAppend(reinterpret_cast(s.data()), From 5c262a9e0e1a2439855e2b0a146c83e996f6f661 Mon Sep 17 00:00:00 2001 From: Crystal Zhou Date: Fri, 7 Feb 2025 14:40:43 -0500 Subject: [PATCH 7/7] Copy GetNullBitmapBuffer to own file and call it --- .../compute/kernels/scalar_cast_string.cc | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index a3393614bc5ab..efff10afaaa7e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -46,6 +46,21 @@ namespace internal { namespace { +Result> GetNullBitmapBuffer(const ArraySpan& in_array, + MemoryPool* pool) { + if (in_array.buffers[0].data == nullptr) { + return nullptr; + } + + if (in_array.offset == 0) { + return in_array.GetBuffer(0); + } + + // If a non-zero offset, we need to shift the bitmap + return arrow::internal::CopyBitmap(pool, in_array.buffers[0].data, in_array.offset, + in_array.length); +} + // ---------------------------------------------------------------------- // Number / Boolean to String @@ -334,19 +349,8 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou output->SetNullCount(input.null_count); // Set up validity bitmap - if (input.offset == output->offset) { - output->buffers[0] = input.GetBuffer(0); - } else { - // When the offsets are different (e.g., due to slice operation), we need to check if - // the null bitmap buffer is not null before copying it. The null bitmap buffer can be - // null if the input array value does not contain any null value. - if (input.buffers[0].data != NULLPTR) { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); - } - } + ARROW_ASSIGN_OR_RAISE(output->buffers[0], + GetNullBitmapBuffer(input, ctx->memory_pool())); // Set up offset and data buffer OffsetBuilder offset_builder(ctx->memory_pool());