Skip to content

Commit

Permalink
Read nested Parquet 2-level lists correctly (#6757)
Browse files Browse the repository at this point in the history
* read nested 2-level lists correctly

* address review comments

* Apply suggestions from code review

Co-authored-by: Raphael Taylor-Davies <[email protected]>

---------

Co-authored-by: Raphael Taylor-Davies <[email protected]>
  • Loading branch information
etseidl and tustvold authored Nov 27, 2024
1 parent 088e5db commit c8fda9a
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 6 deletions.
52 changes: 51 additions & 1 deletion parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ mod tests {
};
use arrow_array::*;
use arrow_buffer::{i256, ArrowNativeType, Buffer, IntervalDayTime};
use arrow_data::ArrayDataBuilder;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field, Fields, Schema, SchemaRef, TimeUnit,
};
Expand Down Expand Up @@ -4178,6 +4178,56 @@ mod tests {
}
}

#[test]
fn test_read_old_nested_list() {
use arrow::datatypes::DataType;
use arrow::datatypes::ToByteSlice;

let testdata = arrow::util::test_util::parquet_test_data();
// message my_record {
// REQUIRED group a (LIST) {
// REPEATED group array (LIST) {
// REPEATED INT32 array;
// }
// }
// }
// should be read as list<list<int32>>
let path = format!("{testdata}/old_list_structure.parquet");
let test_file = File::open(path).unwrap();

// create expected ListArray
let a_values = Int32Array::from(vec![1, 2, 3, 4]);

// Construct a buffer for value offsets, for the nested array: [[1, 2], [3, 4]]
let a_value_offsets = arrow::buffer::Buffer::from([0, 2, 4].to_byte_slice());

// Construct a list array from the above two
let a_list_data = ArrayData::builder(DataType::List(Arc::new(Field::new(
"array",
DataType::Int32,
false,
))))
.len(2)
.add_buffer(a_value_offsets)
.add_child_data(a_values.into_data())
.build()
.unwrap();
let a = ListArray::from(a_list_data);

let builder = ParquetRecordBatchReaderBuilder::try_new(test_file).unwrap();
let mut reader = builder.build().unwrap();
let out = reader.next().unwrap().unwrap();
assert_eq!(out.num_rows(), 1);
assert_eq!(out.num_columns(), 1);
// grab first column
let c0 = out.column(0);
let c0arr = c0.as_any().downcast_ref::<ListArray>().unwrap();
// get first row: [[1, 2], [3, 4]]
let r0 = c0arr.value(0);
let r0arr = r0.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(r0arr, &a);
}

#[test]
fn test_map_no_value() {
// File schema:
Expand Down
16 changes: 11 additions & 5 deletions parquet/src/arrow/schema/complex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,15 +453,21 @@ impl Visitor {
};
}

// test to see if the repeated field is a struct or one-tuple
let items = repeated_field.get_fields();
if items.len() != 1
|| repeated_field.name() == "array"
|| repeated_field.name() == format!("{}_tuple", list_type.name())
|| (!repeated_field.is_list()
&& !repeated_field.has_single_repeated_child()
&& (repeated_field.name() == "array"
|| repeated_field.name() == format!("{}_tuple", list_type.name())))
{
// If the repeated field is a group with multiple fields, then its type is the element type and elements are required.
// If the repeated field is a group with multiple fields, then its type is the element
// type and elements are required.
//
// If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name
// with _tuple appended then the repeated type is the element type and elements are required.
// If the repeated field is a group with one field and is named either array or uses
// the LIST-annotated group's name with _tuple appended then the repeated type is the
// element type and elements are required. But this rule only applies if the
// repeated field is not annotated, and the single child field is not `repeated`.
let context = VisitorContext {
rep_level: context.rep_level,
def_level,
Expand Down
30 changes: 30 additions & 0 deletions parquet/src/record/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,19 @@ impl Reader {
/// <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md>
/// #backward-compatibility-rules
fn is_element_type(repeated_type: &Type) -> bool {
// For legacy 2-level list types whose element type is a 2-level list
//
// // ARRAY<ARRAY<INT>> (nullable list, non-null elements)
// optional group my_list (LIST) {
// repeated group array (LIST) {
// repeated int32 array;
// };
// }
//
if repeated_type.is_list() || repeated_type.has_single_repeated_child() {
return false;
}

// For legacy 2-level list types with primitive element type, e.g.:
//
// // ARRAY<INT> (nullable list, non-null elements)
Expand Down Expand Up @@ -1882,4 +1895,21 @@ mod tests {
let iter = row_group_reader.get_row_iter(schema)?;
Ok(iter.map(|row| row.unwrap()).collect())
}

#[test]
fn test_read_old_nested_list() {
let rows = test_file_reader_rows("old_list_structure.parquet", None).unwrap();
let expected_rows = vec![row![(
"a".to_string(),
Field::ListInternal(make_list(
[
make_list([1, 2].map(Field::Int).to_vec()),
make_list([3, 4].map(Field::Int).to_vec())
]
.map(Field::ListInternal)
.to_vec()
))
),]];
assert_eq!(rows, expected_rows);
}
}
23 changes: 23 additions & 0 deletions parquet/src/schema/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,29 @@ impl Type {
self.get_basic_info().has_repetition()
&& self.get_basic_info().repetition() != Repetition::REQUIRED
}

/// Returns `true` if this type is annotated as a list.
pub(crate) fn is_list(&self) -> bool {
if self.is_group() {
let basic_info = self.get_basic_info();
if let Some(logical_type) = basic_info.logical_type() {
return logical_type == LogicalType::List;
}
return basic_info.converted_type() == ConvertedType::LIST;
}
false
}

/// Returns `true` if this type is a group with a single child field that is `repeated`.
pub(crate) fn has_single_repeated_child(&self) -> bool {
if self.is_group() {
let children = self.get_fields();
return children.len() == 1
&& children[0].get_basic_info().has_repetition()
&& children[0].get_basic_info().repetition() == Repetition::REPEATED;
}
false
}
}

/// A builder for primitive types. All attributes are optional
Expand Down

0 comments on commit c8fda9a

Please sign in to comment.