From c0521478a869e9c74a76f44baeb68c2760952a1f Mon Sep 17 00:00:00 2001
From: Jia-Xuan Liu
Date: Thu, 10 Oct 2024 18:17:09 +0800
Subject: [PATCH 1/2] allow to apply hint for binary as Utf8 type

---
 parquet/src/arrow/arrow_reader/mod.rs | 113 ++++++++++++++++++++++++++
 parquet/src/arrow/schema/primitive.rs |   5 ++
 2 files changed, 118 insertions(+)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a109851f72b..7d1b8bcf1c5 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3077,6 +3077,119 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_read_binary_as_utf8() {
+        let file = write_parquet_from_iter(vec![
+            (
+                "binary_to_utf8",
+                Arc::new(BinaryArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+            (
+                "large_binary_to_large_utf8",
+                Arc::new(LargeBinaryArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+            (
+                "binary_view_to_utf8_view",
+                Arc::new(BinaryViewArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+        ]);
+        let supplied_fields = Fields::from(vec![
+            Field::new("binary_to_utf8", ArrowDataType::Utf8, false),
+            Field::new(
+                "large_binary_to_large_utf8",
+                ArrowDataType::LargeUtf8,
+                false,
+            ),
+            Field::new("binary_view_to_utf8_view", ArrowDataType::Utf8View, false),
+        ]);
+
+        let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+        let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+            file.try_clone().unwrap(),
+            options,
+        )
+        .expect("reader builder with schema")
+        .build()
+        .expect("reader with schema");
+
+        let batch = arrow_reader.next().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 3);
+        assert_eq!(batch.num_rows(), 3);
+        assert_eq!(
+            batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .expect("downcast to string")
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+
+        assert_eq!(
+            batch
+                .column(1)
+                .as_any()
+                .downcast_ref::<LargeStringArray>()
+                .expect("downcast to large string")
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+
+        assert_eq!(
+            batch
+                .column(2)
+                .as_any()
+                .downcast_ref::<StringViewArray>()
+                .expect("downcast to string view")
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Invalid UTF8 sequence at")]
+    fn test_read_non_utf8_binary_as_utf8() {
+        let file = write_parquet_from_iter(vec![(
+            "non_utf8_binary",
+            Arc::new(BinaryArray::from(vec![
+                b"\xDE\x00\xFF".as_ref(),
+                b"\xDE\x01\xAA".as_ref(),
+                b"\xDE\x02\xFF".as_ref(),
+            ])) as ArrayRef,
+        )]);
+        let supplied_fields = Fields::from(vec![Field::new(
+            "non_utf8_binary",
+            ArrowDataType::Utf8,
+            false,
+        )]);
+
+        let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+        let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+            file.try_clone().unwrap(),
+            options,
+        )
+        .expect("reader builder with schema")
+        .build()
+        .expect("reader with schema");
+
+        arrow_reader.next();
+    }
+
     #[test]
     fn test_with_schema() {
         let nested_fields = Fields::from(vec![
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 17dd7862f3d..9f215b4dc07 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -57,6 +57,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
         (DataType::Utf8, DataType::LargeUtf8) => hint,
         (DataType::Binary, DataType::LargeBinary) => hint,
 
+        // Read as Utf8
+        (DataType::Binary, DataType::Utf8) => hint,
+        (DataType::Binary, DataType::LargeUtf8) => hint,
+        (DataType::Binary, DataType::Utf8View) => hint,
+
         // Determine view type
         (DataType::Utf8, DataType::Utf8View) => hint,
         (DataType::Binary, DataType::BinaryView) => hint,

From 3c2e694e89b12d6b70d7b1aed3f4dcff169de2da Mon Sep 17 00:00:00 2001
From: Jia-Xuan Liu
Date: Thu, 10 Oct 2024 19:56:36 +0800
Subject: [PATCH 2/2] refactor tests

---
 parquet/src/arrow/arrow_reader/mod.rs | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 7d1b8bcf1c5..d3709c03e99 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3130,9 +3130,7 @@ mod tests {
         assert_eq!(
             batch
                 .column(0)
-                .as_any()
-                .downcast_ref::<StringArray>()
-                .expect("downcast to string")
+                .as_string::<i32>()
                 .iter()
                 .collect::<Vec<_>>(),
             vec![Some("one"), Some("two"), Some("three")]
@@ -3141,22 +3139,14 @@ mod tests {
         assert_eq!(
             batch
                 .column(1)
-                .as_any()
-                .downcast_ref::<LargeStringArray>()
-                .expect("downcast to large string")
+                .as_string::<i64>()
                 .iter()
                 .collect::<Vec<_>>(),
            vec![Some("one"), Some("two"), Some("three")]
         );
 
         assert_eq!(
-            batch
-                .column(2)
-                .as_any()
-                .downcast_ref::<StringViewArray>()
-                .expect("downcast to string view")
-                .iter()
-                .collect::<Vec<_>>(),
+            batch.column(2).as_string_view().iter().collect::<Vec<_>>(),
             vec![Some("one"), Some("two"), Some("three")]
         );
     }
@@ -3186,8 +3176,7 @@ mod tests {
         .expect("reader builder with schema")
         .build()
         .expect("reader with schema");
-
-        arrow_reader.next();
+        arrow_reader.next().unwrap().unwrap_err();
     }
 
     #[test]