diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 4edf00225d317..efff10afaaa7e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -46,6 +46,21 @@ namespace internal { namespace { +Result> GetNullBitmapBuffer(const ArraySpan& in_array, + MemoryPool* pool) { + if (in_array.buffers[0].data == nullptr) { + return nullptr; + } + + if (in_array.offset == 0) { + return in_array.GetBuffer(0); + } + + // If a non-zero offset, we need to shift the bitmap + return arrow::internal::CopyBitmap(pool, in_array.buffers[0].data, in_array.offset, + in_array.length); +} + // ---------------------------------------------------------------------- // Number / Boolean to String @@ -314,7 +329,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou template enable_if_t::value && is_base_binary_type::value, Status> BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - using OutputBuilderType = typename TypeTraits::BuilderType; + using offset_type = typename O::offset_type; + using DataBuilder = TypedBufferBuilder; + using OffsetBuilder = TypedBufferBuilder; const CastOptions& options = checked_cast(*ctx->state()).options; const ArraySpan& input = batch[0].array; @@ -327,31 +344,38 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } } + ArrayData* output = out->array_data().get(); + output->length = input.length; + output->SetNullCount(input.null_count); + + // Set up validity bitmap + ARROW_ASSIGN_OR_RAISE(output->buffers[0], + GetNullBitmapBuffer(input, ctx->memory_pool())); + + // Set up offset and data buffer + OffsetBuilder offset_builder(ctx->memory_pool()); + RETURN_NOT_OK(offset_builder.Reserve(input.length + 1)); + offset_builder.UnsafeAppend(0); // offsets start at 0 const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( input.GetValues(1), input.length); - - // TODO(GH-43573): A more efficient implementation that copies the validity - // bitmap all at once is possible, but would mean we don't delegate all the - // building logic to the ArrayBuilder implementation for the output type. - OutputBuilderType builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); - RETURN_NOT_OK(builder.Resize(input.length)); - RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes)); - arrow::internal::ArraySpanInlineVisitor visitor; - RETURN_NOT_OK(visitor.VisitStatus( + DataBuilder data_builder(ctx->memory_pool()); + RETURN_NOT_OK(data_builder.Reserve(sum_of_binary_view_sizes)); + RETURN_NOT_OK(VisitArraySpanInline( input, - [&](std::string_view v) { - // Append valid string view - return builder.Append(v); + [&](std::string_view s) { + // for non-null value, append string view to buffer and calculate offset + data_builder.UnsafeAppend(reinterpret_cast(s.data()), + static_cast(s.size())); + offset_builder.UnsafeAppend(static_cast(data_builder.length())); + return Status::OK(); }, [&]() { - // Append null - builder.UnsafeAppendNull(); + // for null value, no need to update data buffer + offset_builder.UnsafeAppend(static_cast(data_builder.length())); return Status::OK(); })); - - std::shared_ptr output_array; - RETURN_NOT_OK(builder.FinishInternal(&output_array)); - out->value = std::move(output_array); + RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1])); + RETURN_NOT_OK(data_builder.Finish(&output->buffers[2])); return Status::OK(); }