-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-43573: [C++] Copy bitmap when casting from string-view to offset string and binary types #44822
Open
CrystalZhou0529
wants to merge
7
commits into
apache:main
Choose a base branch
from
CrystalZhou0529:43573-copy-bitmap-when-cast-string
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+43
−19
Open
Changes from 4 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
14d7d5c
Optimize string view to offset string cast
CrystalZhou0529 3d6b715
Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc
CrystalZhou0529 5c4a5d1
Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc
CrystalZhou0529 068300e
Fix allocation of data buffer size
CrystalZhou0529 6341169
Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc
CrystalZhou0529 4312be2
Update cpp/src/arrow/compute/kernels/scalar_cast_string.cc
CrystalZhou0529 5c262a9
Copy GetNullBitmapBuffer to own file and call it
CrystalZhou0529 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -314,7 +314,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou | |
template <typename O, typename I> | ||
enable_if_t<is_binary_view_like_type<I>::value && is_base_binary_type<O>::value, Status> | ||
BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { | ||
using OutputBuilderType = typename TypeTraits<O>::BuilderType; | ||
using offset_type = typename O::offset_type; | ||
using DataBuilder = TypedBufferBuilder<uint8_t>; | ||
using OffsetBuilder = TypedBufferBuilder<offset_type>; | ||
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; | ||
const ArraySpan& input = batch[0].array; | ||
|
||
|
@@ -327,31 +329,49 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou | |
} | ||
} | ||
|
||
ArrayData* output = out->array_data().get(); | ||
output->length = input.length; | ||
output->SetNullCount(input.null_count); | ||
|
||
// Set up validity bitmap | ||
if (input.offset == output->offset) { | ||
output->buffers[0] = input.GetBuffer(0); | ||
} else { | ||
// When the offsets are different (e.g., due to slice operation), we need to check if | ||
// the null bitmap buffer is not null before copying it. The null bitmap buffer can be | ||
// null if the input array value does not contain any null value. | ||
if (input.buffers[0].data != NULLPTR) { | ||
ARROW_ASSIGN_OR_RAISE( | ||
output->buffers[0], | ||
arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, | ||
input.offset, input.length)); | ||
} | ||
} | ||
|
||
// Set up offset and data buffer | ||
OffsetBuilder offset_builder(ctx->memory_pool()); | ||
RETURN_NOT_OK(offset_builder.Reserve(batch.length + 1)); | ||
CrystalZhou0529 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
offset_builder.UnsafeAppend(0); // offsets start at 0 | ||
const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( | ||
input.GetValues<BinaryViewType::c_type>(1), input.length); | ||
|
||
// TODO(GH-43573): A more efficient implementation that copies the validity | ||
// bitmap all at once is possible, but would mean we don't delegate all the | ||
// building logic to the ArrayBuilder implementation for the output type. | ||
OutputBuilderType builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); | ||
RETURN_NOT_OK(builder.Resize(input.length)); | ||
RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes)); | ||
arrow::internal::ArraySpanInlineVisitor<I> visitor; | ||
RETURN_NOT_OK(visitor.VisitStatus( | ||
input, | ||
[&](std::string_view v) { | ||
// Append valid string view | ||
return builder.Append(v); | ||
DataBuilder data_builder(ctx->memory_pool()); | ||
RETURN_NOT_OK(data_builder.Reserve(sum_of_binary_view_sizes)); | ||
RETURN_NOT_OK(VisitArraySpanInline<I>( | ||
batch[0].array, | ||
CrystalZhou0529 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
[&](std::string_view s) { | ||
// for non-null value, append string view to buffer and calculate offset | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does calculate offset means? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. taking it from the length of the buffer builder after the append |
||
data_builder.UnsafeAppend(reinterpret_cast<const uint8_t*>(s.data()), | ||
static_cast<int64_t>(s.size())); | ||
offset_builder.UnsafeAppend(static_cast<offset_type>(data_builder.length())); | ||
return Status::OK(); | ||
}, | ||
[&]() { | ||
// Append null | ||
builder.UnsafeAppendNull(); | ||
// for null value, no need to update data buffer | ||
offset_builder.UnsafeAppend(static_cast<offset_type>(data_builder.length())); | ||
return Status::OK(); | ||
})); | ||
|
||
std::shared_ptr<ArrayData> output_array; | ||
RETURN_NOT_OK(builder.FinishInternal(&output_array)); | ||
out->value = std::move(output_array); | ||
RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1])); | ||
RETURN_NOT_OK(data_builder.Finish(&output->buffers[2])); | ||
return Status::OK(); | ||
} | ||
|
||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we also need a comment here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you copy and paste this utility function [1] to this compilation unit and call it from here instead?
[1] https://github.com/apache/arrow/blob/main/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc#L42
(Later the utility could be moved to a .h so it's callable from anywhere and inlinable. But I'm suggesting a copy because it's tricky to name this function in an informative and non-error-prone way.)