-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Core: Update variant class visibility #12105
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,11 +29,11 @@ class SerializedArray extends Variants.SerializedValue implements VariantArray { | |
private static final int IS_LARGE = 0b10000; | ||
|
||
@VisibleForTesting | ||
static SerializedArray from(SerializedMetadata metadata, byte[] bytes) { | ||
static SerializedArray from(VariantMetadata metadata, byte[] bytes) { | ||
return from(metadata, ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN), bytes[0]); | ||
} | ||
|
||
static SerializedArray from(SerializedMetadata metadata, ByteBuffer value, int header) { | ||
static SerializedArray from(VariantMetadata metadata, ByteBuffer value, int header) { | ||
Preconditions.checkArgument( | ||
value.order() == ByteOrder.LITTLE_ENDIAN, "Unsupported byte order: big endian"); | ||
Variants.BasicType basicType = VariantUtil.basicType(header); | ||
|
@@ -42,14 +42,14 @@ static SerializedArray from(SerializedMetadata metadata, ByteBuffer value, int h | |
return new SerializedArray(metadata, value, header); | ||
} | ||
|
||
private final SerializedMetadata metadata; | ||
private final VariantMetadata metadata; | ||
private final ByteBuffer value; | ||
private final int offsetSize; | ||
private final int offsetListOffset; | ||
private final int dataOffset; | ||
private final VariantValue[] array; | ||
|
||
private SerializedArray(SerializedMetadata metadata, ByteBuffer value, int header) { | ||
private SerializedArray(VariantMetadata metadata, ByteBuffer value, int header) { | ||
this.metadata = metadata; | ||
this.value = value; | ||
this.offsetSize = 1 + ((header & OFFSET_SIZE_MASK) >> OFFSET_SIZE_SHIFT); | ||
|
@@ -61,8 +61,8 @@ private SerializedArray(SerializedMetadata metadata, ByteBuffer value, int heade | |
this.array = new VariantValue[numElements]; | ||
} | ||
|
||
@VisibleForTesting | ||
int numElements() { | ||
@Override | ||
public int numElements() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for addressing it. I was trying to make Parquet change in (https://github.com/apache/iceberg/pull/11653/files#diff-b8e8443fcec3843e538dbc702d4c131ff58359cb83ccdb211d8679c1d77c16bd) and we need to expose this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I needed it for the readers and for the updates here to |
||
return array.length; | ||
} | ||
|
||
|
@@ -76,7 +76,7 @@ public VariantValue get(int index) { | |
VariantUtil.readLittleEndianUnsigned( | ||
value, offsetListOffset + (offsetSize * (1 + index)), offsetSize); | ||
array[index] = | ||
Variants.from(metadata, VariantUtil.slice(value, dataOffset + offset, next - offset)); | ||
Variants.value(metadata, VariantUtil.slice(value, dataOffset + offset, next - offset)); | ||
} | ||
return array[index]; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,9 +21,13 @@ | |
import java.nio.ByteBuffer; | ||
import java.nio.ByteOrder; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; | ||
import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; | ||
import org.apache.iceberg.relocated.com.google.common.collect.Iterables; | ||
import org.apache.iceberg.relocated.com.google.common.collect.Maps; | ||
import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
import org.apache.iceberg.util.Pair; | ||
import org.apache.iceberg.util.SortedMerge; | ||
|
||
|
@@ -35,22 +39,55 @@ | |
* fields. This also does not allow updating or replacing the metadata for the unshredded object, | ||
* which could require recursively rewriting field IDs. | ||
*/ | ||
class ShreddedObject implements VariantObject { | ||
private final SerializedMetadata metadata; | ||
private final SerializedObject unshredded; | ||
public class ShreddedObject implements VariantObject { | ||
private final VariantMetadata metadata; | ||
private final VariantObject unshredded; | ||
private final Map<String, VariantValue> shreddedFields = Maps.newHashMap(); | ||
private final Set<String> removedFields = Sets.newHashSet(); | ||
private SerializationState serializationState = null; | ||
|
||
ShreddedObject(SerializedMetadata metadata) { | ||
ShreddedObject(VariantMetadata metadata) { | ||
this.metadata = metadata; | ||
this.unshredded = null; | ||
} | ||
|
||
ShreddedObject(SerializedObject unshredded) { | ||
this.metadata = unshredded.metadata(); | ||
ShreddedObject(VariantMetadata metadata, VariantObject unshredded) { | ||
this.metadata = metadata; | ||
this.unshredded = unshredded; | ||
} | ||
|
||
@VisibleForTesting | ||
VariantMetadata metadata() { | ||
return metadata; | ||
} | ||
|
||
private Set<String> nameSet() { | ||
Set<String> names = Sets.newHashSet(shreddedFields.keySet()); | ||
|
||
if (unshredded != null) { | ||
Iterables.addAll(names, unshredded.fieldNames()); | ||
} | ||
|
||
names.removeAll(removedFields); | ||
|
||
return names; | ||
} | ||
|
||
@Override | ||
public Iterable<String> fieldNames() { | ||
return nameSet(); | ||
} | ||
|
||
@Override | ||
public int numFields() { | ||
return nameSet().size(); | ||
} | ||
|
||
public void remove(String field) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does this remove() try to support or to be used? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The purpose of this class is to create objects from an unshredded, serialized variant in This is intended to handle fields that are "missing" because the field's I thought it was cleaner to handle missing fields by calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for explanation. If a field is missing and we remove the field from
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That doesn't handle the case where |
||
shreddedFields.remove(field); | ||
removedFields.add(field); | ||
} | ||
|
||
public void put(String field, VariantValue value) { | ||
Preconditions.checkArgument( | ||
metadata.id(field) >= 0, "Cannot find field name in metadata: %s", field); | ||
|
@@ -63,6 +100,10 @@ public void put(String field, VariantValue value) { | |
|
||
@Override | ||
public VariantValue get(String field) { | ||
if (removedFields.contains(field)) { | ||
return null; | ||
} | ||
|
||
// the shredded value takes precedence if there is a conflict | ||
VariantValue value = shreddedFields.get(field); | ||
if (value != null) { | ||
|
@@ -79,7 +120,8 @@ public VariantValue get(String field) { | |
@Override | ||
public int sizeInBytes() { | ||
if (null == serializationState) { | ||
this.serializationState = new SerializationState(metadata, unshredded, shreddedFields); | ||
this.serializationState = | ||
new SerializationState(metadata, unshredded, shreddedFields, removedFields); | ||
} | ||
|
||
return serializationState.size(); | ||
|
@@ -91,15 +133,16 @@ public int writeTo(ByteBuffer buffer, int offset) { | |
buffer.order() == ByteOrder.LITTLE_ENDIAN, "Invalid byte order: big endian"); | ||
|
||
if (null == serializationState) { | ||
this.serializationState = new SerializationState(metadata, unshredded, shreddedFields); | ||
this.serializationState = | ||
new SerializationState(metadata, unshredded, shreddedFields, removedFields); | ||
} | ||
|
||
return serializationState.writeTo(buffer, offset); | ||
} | ||
|
||
/** Common state for {@link #size()} and {@link #writeTo(ByteBuffer, int)} */ | ||
private static class SerializationState { | ||
private final SerializedMetadata metadata; | ||
private final VariantMetadata metadata; | ||
private final Map<String, ByteBuffer> unshreddedFields; | ||
private final Map<String, VariantValue> shreddedFields; | ||
private final int dataSize; | ||
|
@@ -109,28 +152,38 @@ private static class SerializationState { | |
private final int offsetSize; | ||
|
||
private SerializationState( | ||
SerializedMetadata metadata, | ||
SerializedObject unshredded, | ||
Map<String, VariantValue> shreddedFields) { | ||
VariantMetadata metadata, | ||
VariantObject unshredded, | ||
Map<String, VariantValue> shreddedFields, | ||
Set<String> removedFields) { | ||
this.metadata = metadata; | ||
// field ID size is the size needed to store the largest field ID in the data | ||
this.fieldIdSize = VariantUtil.sizeOf(metadata.dictionarySize()); | ||
this.shreddedFields = shreddedFields; | ||
this.shreddedFields = Maps.newHashMap(shreddedFields); | ||
|
||
int totalDataSize = 0; | ||
// get the unshredded field names and values as byte buffers | ||
ImmutableMap.Builder<String, ByteBuffer> unshreddedBuilder = ImmutableMap.builder(); | ||
if (unshredded != null) { | ||
for (Pair<String, Integer> field : unshredded.fields()) { | ||
if (unshredded instanceof SerializedObject) { | ||
// for serialized objects, use existing buffers instead of materializing values | ||
SerializedObject serialized = (SerializedObject) unshredded; | ||
for (Pair<String, Integer> field : serialized.fields()) { | ||
// if the value is replaced by an unshredded field, don't include it | ||
String name = field.first(); | ||
boolean replaced = shreddedFields.containsKey(name); | ||
boolean replaced = shreddedFields.containsKey(name) || removedFields.contains(name); | ||
if (!replaced) { | ||
ByteBuffer value = unshredded.sliceValue(field.second()); | ||
ByteBuffer value = serialized.sliceValue(field.second()); | ||
unshreddedBuilder.put(name, value); | ||
totalDataSize += value.remaining(); | ||
} | ||
} | ||
} else if (unshredded != null) { | ||
for (String name : unshredded.fieldNames()) { | ||
boolean replaced = shreddedFields.containsKey(name) || removedFields.contains(name); | ||
if (!replaced) { | ||
shreddedFields.put(name, unshredded.get(name)); | ||
} | ||
} | ||
} | ||
|
||
this.unshreddedFields = unshreddedBuilder.build(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: it seems the existing implementation is preferable — it is cleaner and consistent with the other types.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The trade-off is that this would require a separate reader just for boolean values. I'm open to that if there are strong objections, but it seems to me that allowing the actual type to be fixed up depending on the value is a reasonable change that saves quite a bit of code. If you object to this, I can revert it and add the boolean-specific reader.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah. You are right. The `BOOLEAN_TRUE` and `BOOLEAN_FALSE` physical types need special handling. When we shred them, they should be grouped together and considered as one type. And also we may need to have a type list {`NULL`, `BOOLEAN`, `INT8`, `INT16`, etc.}, which is almost the same as the physical type list, but with `BOOLEAN` standing in for `BOOLEAN_TRUE` and `BOOLEAN_FALSE`. Otherwise, we can't represent a shredded column type for true/false.

I'm fine with keeping it simple for now. We can revisit if needed.