From 84e6d80971fc93817d4b4266047df1715bf4270d Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 09:21:51 -0600 Subject: [PATCH 1/7] Implement Take for UnionArray (#4883) --- arrow-select/src/take.rs | 54 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index 0f5689ff9990..a6f394d525a5 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -28,7 +28,7 @@ use arrow_buffer::{ ScalarBuffer, }; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, FieldRef}; +use arrow_schema::{ArrowError, DataType, FieldRef, UnionMode}; use num::{One, Zero}; @@ -207,6 +207,21 @@ fn take_impl( Ok(new_null_array(&DataType::Null, indices.len())) } } + DataType::Union(fields, UnionMode::Sparse) => { + let mut field_type_ids = Vec::with_capacity(fields.len()); + let mut children = Vec::with_capacity(fields.len()); + let values = values.as_any().downcast_ref::().unwrap(); + let type_ids = take_native(values.type_ids(), indices).into_inner(); + for (type_id, field) in fields.iter() { + let values = values.child(type_id); + let values = take_impl(values, indices)?; + let field = (**field).clone(); + children.push((field, values)); + field_type_ids.push(type_id); + } + let array = UnionArray::try_new(field_type_ids.as_slice(), type_ids, None, children)?; + Ok(Arc::new(array)) + } t => unimplemented!("Take not supported for data type {:?}", t) } } @@ -1949,4 +1964,41 @@ mod tests { .collect::>(); assert_eq!(&values, &[Some(23), Some(4), None, None]) } + + #[test] + fn test_take_union() { + let structs = create_test_struct(vec![ + Some((Some(true), Some(42))), + Some((Some(false), Some(28))), + Some((Some(false), Some(19))), + Some((Some(true), Some(31))), + None, + ]); + let strings = + StringArray::from(vec![Some("a"), None, Some("c"), None, Some("d")]); + let type_ids = Buffer::from_slice_ref(vec![1i8; 5]); + + let children: Vec<(Field, Arc)> = vec![ + ( + Field::new("f1", structs.data_type().clone(), true), + Arc::new(structs), + ), + ( + Field::new("f2", strings.data_type().clone(), true), + Arc::new(strings), + ), + ]; + let array = UnionArray::try_new(&[0, 1], type_ids, None, children).unwrap(); + + let indices = vec![0, 3, 1, 0, 2, 4]; + let index = UInt32Array::from(indices.clone()); + let actual = take(&array, &index, None).unwrap(); + let actual = actual.as_any().downcast_ref::().unwrap(); + let strings = actual.child(1); + let strings = strings.as_any().downcast_ref::().unwrap(); + + let actual = strings.iter().collect::>(); + let expected = vec![Some("a"), None, None, Some("a"), Some("c"), Some("d")]; + assert_eq!(expected, actual); + } } From cbaf98c5ddb2b57d7f8b1d00653014fa3e43c3a4 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 10:10:21 -0600 Subject: [PATCH 2/7] fix build --- arrow-select/src/take.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index a6f394d525a5..3870eed104e1 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -214,7 +214,7 @@ fn take_impl( let type_ids = take_native(values.type_ids(), indices).into_inner(); for (type_id, field) in fields.iter() { let values = values.child(type_id); - let values = take_impl(values, indices)?; + let values = take_impl(values, indices, None)?; let field = (**field).clone(); children.push((field, values)); field_type_ids.push(type_id); From ee3161763ffc55fd46c85001f9f81f0897604459 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 10:20:41 -0600 Subject: [PATCH 3/7] clippy --- arrow-buffer/src/native.rs | 2 +- arrow-buffer/src/util/bit_chunk_iterator.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 8fe6cf2b7894..1e6b688e5c91 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -222,7 +222,7 @@ pub trait ToByteSlice { impl ToByteSlice for [T] { #[inline] fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self.as_ptr() as *const T as *const u8; + let raw_ptr = self.as_ptr(); unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of_val(self)) } } } diff --git a/arrow-buffer/src/util/bit_chunk_iterator.rs b/arrow-buffer/src/util/bit_chunk_iterator.rs index 3d9632e73229..6830acae94a1 100644 --- a/arrow-buffer/src/util/bit_chunk_iterator.rs +++ b/arrow-buffer/src/util/bit_chunk_iterator.rs @@ -157,7 +157,7 @@ impl<'a> UnalignedBitChunk<'a> { self.prefix .into_iter() .chain(self.chunks.iter().cloned()) - .chain(self.suffix.into_iter()) + .chain(self.suffix) } /// Counts the number of ones From 50769ab587b09413b8892510f7ea305643ad655d Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 10:26:41 -0600 Subject: [PATCH 4/7] build error --- arrow-buffer/src/native.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/native.rs b/arrow-buffer/src/native.rs index 1e6b688e5c91..38074a8dc26c 100644 --- a/arrow-buffer/src/native.rs +++ b/arrow-buffer/src/native.rs @@ -222,7 +222,7 @@ pub trait ToByteSlice { impl ToByteSlice for [T] { #[inline] fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self.as_ptr(); + let raw_ptr = self.as_ptr() as *const u8; unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of_val(self)) } } } From 17b86e61f1cca5b7bb20290ee9ff0054dbe8de1c Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 10:29:20 -0600 Subject: [PATCH 5/7] python error --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 7e80aea6b978..6ecde626b80f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -49,7 +49,7 @@ jobs: - name: Install python dev run: | apt update - apt install -y libpython3.9-dev + apt install -y libpython3.11-dev - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From f510794a0bd5bde51eab1bf61a7635d2f8647920 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 10:35:22 -0600 Subject: [PATCH 6/7] clippy --- object_store/src/memory.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index 82d485997e88..3138fe43d109 100644 --- a/object_store/src/memory.rs +++ b/object_store/src/memory.rs @@ -371,7 +371,7 @@ impl AsyncWrite for InMemoryAppend { if let Some((bytes, _)) = writer.remove(&self.location) { let buf = std::mem::take(&mut self.data); - let concat = Bytes::from_iter(bytes.into_iter().chain(buf.into_iter())); + let concat = Bytes::from_iter(bytes.into_iter().chain(buf)); writer.insert(self.location.clone(), (concat, Utc::now())); } else { writer.insert( From bbf8ec42777b4a9e32614d9c4a1257a59dbefb48 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Mon, 2 Oct 2023 10:40:21 -0600 Subject: [PATCH 7/7] clippy --- object_store/src/util.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 79ca4bb7a834..13affccec1a0 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -32,8 +32,9 @@ where D: serde::Deserializer<'de>, { let s: String = serde::Deserialize::deserialize(deserializer)?; - chrono::TimeZone::datetime_from_str(&chrono::Utc, &s, RFC1123_FMT) - .map_err(serde::de::Error::custom) + let naive = chrono::NaiveDateTime::parse_from_str(&s, RFC1123_FMT) + .map_err(serde::de::Error::custom)?; + Ok(chrono::TimeZone::from_utc_datetime(&chrono::Utc, &naive)) } #[cfg(any(feature = "aws", feature = "azure"))]