diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a109851f72b..d3709c03e99 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3077,6 +3077,112 @@ mod tests {
         );
     }
 
+    // Binary, LargeBinary and BinaryView arrays written to parquet are read back
+    // as Utf8, LargeUtf8 and Utf8View when the supplied schema requests them.
+    #[test]
+    fn test_read_binary_as_utf8() {
+        let file = write_parquet_from_iter(vec![
+            (
+                "binary_to_utf8",
+                Arc::new(BinaryArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+            (
+                "large_binary_to_large_utf8",
+                Arc::new(LargeBinaryArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+            (
+                "binary_view_to_utf8_view",
+                Arc::new(BinaryViewArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+        ]);
+        let supplied_fields = Fields::from(vec![
+            Field::new("binary_to_utf8", ArrowDataType::Utf8, false),
+            Field::new(
+                "large_binary_to_large_utf8",
+                ArrowDataType::LargeUtf8,
+                false,
+            ),
+            Field::new("binary_view_to_utf8_view", ArrowDataType::Utf8View, false),
+        ]);
+
+        let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+        let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+            file.try_clone().unwrap(),
+            options,
+        )
+        .expect("reader builder with schema")
+        .build()
+        .expect("reader with schema");
+
+        let batch = arrow_reader.next().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 3);
+        assert_eq!(batch.num_rows(), 3);
+        assert_eq!(
+            batch
+                .column(0)
+                .as_string::<i32>()
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+
+        assert_eq!(
+            batch
+                .column(1)
+                .as_string::<i64>()
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+
+        assert_eq!(
+            batch.column(2).as_string_view().iter().collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+    }
+
+    // Binary data that is not valid UTF-8 is requested as Utf8 here; the test
+    // passes only if a panic mentioning "Invalid UTF8 sequence at" is raised.
+    #[test]
+    #[should_panic(expected = "Invalid UTF8 sequence at")]
+    fn test_read_non_utf8_binary_as_utf8() {
+        let file = write_parquet_from_iter(vec![(
+            "non_utf8_binary",
+            Arc::new(BinaryArray::from(vec![
+                b"\xDE\x00\xFF".as_ref(),
+                b"\xDE\x01\xAA".as_ref(),
+                b"\xDE\x02\xFF".as_ref(),
+            ])) as ArrayRef,
+        )]);
+        let supplied_fields = Fields::from(vec![Field::new(
+            "non_utf8_binary",
+            ArrowDataType::Utf8,
+            false,
+        )]);
+
+        let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+        let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+            file.try_clone().unwrap(),
+            options,
+        )
+        .expect("reader builder with schema")
+        .build()
+        .expect("reader with schema");
+        arrow_reader.next().unwrap().unwrap_err();
+    }
+
     #[test]
     fn test_with_schema() {
         let nested_fields = Fields::from(vec![
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 17dd7862f3d..9f215b4dc07 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -57,6 +57,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
         (DataType::Utf8, DataType::LargeUtf8) => hint,
         (DataType::Binary, DataType::LargeBinary) => hint,
 
+        // Read as Utf8
+        (DataType::Binary, DataType::Utf8) => hint,
+        (DataType::Binary, DataType::LargeUtf8) => hint,
+        (DataType::Binary, DataType::Utf8View) => hint,
+
         // Determine view type
         (DataType::Utf8, DataType::Utf8View) => hint,
         (DataType::Binary, DataType::BinaryView) => hint,