From be6a67db405ab3c6493202ff3ecdc058e7cc2a2e Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 17 Oct 2024 02:31:35 +0800 Subject: [PATCH 01/60] simple support vectorized append. --- .../src/aggregates/group_values/column.rs | 57 +++++++++---- .../aggregates/group_values/group_column.rs | 80 ++++++++++++++++++- 2 files changed, 122 insertions(+), 15 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 4ad75844f7b7..60dd15aa2672 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -73,12 +73,17 @@ pub struct GroupValuesColumn { /// Random state for creating hashes random_state: RandomState, + + column_nullables_buffer: Vec, + + append_rows_buffer: Vec, } impl GroupValuesColumn { /// Create a new instance of GroupValuesColumn if supported for the specified schema pub fn try_new(schema: SchemaRef) -> Result { let map = RawTable::with_capacity(0); + let num_cols = schema.fields.len(); Ok(Self { schema, map, @@ -86,6 +91,8 @@ impl GroupValuesColumn { group_values: vec![], hashes_buffer: Default::default(), random_state: Default::default(), + column_nullables_buffer: vec![false; num_cols], + append_rows_buffer: Vec::new(), }) } @@ -146,6 +153,13 @@ macro_rules! 
instantiate_primitive { }; } +fn append_col_value(mut core: C, array: &ArrayRef, row: usize) +where + C: FnMut(&ArrayRef, usize), +{ + core(array, row); +} + impl GroupValues for GroupValuesColumn { fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { let n_rows = cols[0].len(); @@ -213,6 +227,14 @@ impl GroupValues for GroupValuesColumn { batch_hashes.resize(n_rows, 0); create_hashes(cols, &self.random_state, batch_hashes)?; + // 1.2 Check if columns nullable + for (col_idx, col) in cols.iter().enumerate() { + self.column_nullables_buffer[col_idx] = (col.null_count() != 0); + } + + // 1.3 Check and record which rows of the input should be appended + self.append_rows_buffer.clear(); + let mut current_group_idx = self.group_values[0].len(); for (row, &target_hash) in batch_hashes.iter().enumerate() { let entry = self.map.get_mut(target_hash, |(exist_hash, group_idx)| { // Somewhat surprisingly, this closure can be called even if the @@ -249,31 +271,38 @@ impl GroupValues for GroupValuesColumn { // Add new entry to aggr_state and save newly created index // let group_idx = group_values.num_rows(); // group_values.push(group_rows.row(row)); - - let mut checklen = 0; - let group_idx = self.group_values[0].len(); - for (i, group_value) in self.group_values.iter_mut().enumerate() { - group_value.append_val(&cols[i], row); - let len = group_value.len(); - if i == 0 { - checklen = len; - } else { - debug_assert_eq!(checklen, len); - } - } + let prev_group_idx = current_group_idx; // for hasher function, use precomputed hash value self.map.insert_accounted( - (target_hash, group_idx), + (target_hash, prev_group_idx), |(hash, _group_index)| *hash, &mut self.map_size, ); - group_idx + self.append_rows_buffer.push(row); + current_group_idx += 1; + + prev_group_idx } }; groups.push(group_idx); } + // 1.4 Vectorized append values + for (col_idx, col) in cols.iter().enumerate() { + let col_nullable = self.column_nullables_buffer[col_idx]; + let group_value = 
&mut self.group_values[col_idx]; + if col_nullable { + for &row in self.append_rows_buffer.iter() { + group_value.append_val(&cols[col_idx], row); + } + } else { + for &row in self.append_rows_buffer.iter() { + group_value.append_non_nullable_val(&cols[col_idx], row); + } + } + } + Ok(()) } diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 41534958602e..8ea8b110a7aa 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -58,6 +58,9 @@ pub trait GroupColumn: Send + Sync { fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool; /// Appends the row at `row` in `array` to this builder fn append_val(&mut self, array: &ArrayRef, row: usize); + + fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize); + /// Returns the number of rows stored in this builder fn len(&self) -> usize; /// Returns the number of bytes used by this [`GroupColumn`] @@ -113,6 +116,7 @@ impl GroupColumn self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) } + fn append_val(&mut self, array: &ArrayRef, row: usize) { // Perf: skip null check if input can't have nulls if NULLABLE { @@ -128,6 +132,15 @@ impl GroupColumn } } + fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { + if NULLABLE { + self.nulls.append(false); + self.group_values.push(array.as_primitive::().value(row)); + } else { + self.group_values.push(array.as_primitive::().value(row)); + } + } + fn len(&self) -> usize { self.group_values.len() } @@ -218,6 +231,17 @@ where } } + fn append_non_nullable_val_inner(&mut self, array: &ArrayRef, row: usize) + where + B: ByteArrayType, + { + let arr = array.as_bytes::(); + self.nulls.append(false); + let value: &[u8] = arr.value(row).as_ref(); + self.buffer.append_slice(value); + 
self.offsets.push(O::usize_as(self.buffer.len())); + } + fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool where B: ByteArrayType, @@ -287,6 +311,27 @@ where }; } + fn append_non_nullable_val(&mut self, column: &ArrayRef, row: usize) { + // Sanity array type + match self.output_type { + OutputType::Binary => { + debug_assert!(matches!( + column.data_type(), + DataType::Binary | DataType::LargeBinary + )); + self.append_non_nullable_val_inner::>(column, row) + } + OutputType::Utf8 => { + debug_assert!(matches!( + column.data_type(), + DataType::Utf8 | DataType::LargeUtf8 + )); + self.append_non_nullable_val_inner::>(column, row) + } + _ => unreachable!("View types should use `ArrowBytesViewMap`"), + }; + } + fn len(&self) -> usize { self.offsets.len() - 1 } @@ -382,7 +427,7 @@ where } _ => unreachable!("View types should use `ArrowBytesViewMap`"), } - } + } } /// An implementation of [`GroupColumn`] for binary view and utf8 view types. @@ -482,6 +527,35 @@ impl ByteViewGroupValueBuilder { self.views.push(view); } + fn append_val_non_nullable_inner(&mut self, array: &ArrayRef, row: usize) + where + B: ByteViewType, + { + let arr = array.as_byte_view::(); + + // Not null row case + self.nulls.append(false); + let value: &[u8] = arr.value(row).as_ref(); + + let value_len = value.len(); + let view = if value_len <= 12 { + make_view(value, 0, 0) + } else { + // Ensure big enough block to hold the value firstly + self.ensure_in_progress_big_enough(value_len); + + // Append value + let buffer_index = self.completed.len(); + let offset = self.in_progress.len(); + self.in_progress.extend_from_slice(value); + + make_view(value, buffer_index as u32, offset as u32) + }; + + // Append view + self.views.push(view); + } + fn ensure_in_progress_big_enough(&mut self, value_len: usize) { debug_assert!(value_len > 12); let require_cap = self.in_progress.len() + value_len; @@ -776,6 +850,10 @@ impl GroupColumn for ByteViewGroupValueBuilder { fn 
append_val(&mut self, array: &ArrayRef, row: usize) { self.append_val_inner(array, row) } + + fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { + self.append_val_non_nullable_inner(array, row); + } fn len(&self) -> usize { self.views.len() From 2cdf05dfcef0770b00bbba95e26a99f96cdc95ec Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 17 Oct 2024 03:23:56 +0800 Subject: [PATCH 02/60] fix tests. --- .../src/aggregates/group_values/column.rs | 100 ++++++++++++++++-- 1 file changed, 90 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 60dd15aa2672..8e90f883668d 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -21,21 +21,28 @@ use crate::aggregates::group_values::group_column::{ }; use crate::aggregates::group_values::GroupValues; use ahash::RandomState; -use arrow::compute::cast; +use arrow::compute::{self, cast}; use arrow::datatypes::{ BinaryViewType, Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, StringViewType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow::record_batch::RecordBatch; -use arrow_array::{Array, ArrayRef}; -use arrow_schema::{DataType, Schema, SchemaRef}; +use arrow_array::{ + Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + LargeStringArray, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, +}; +use arrow_schema::{DataType, Schema, SchemaRef, TimeUnit}; use datafusion_common::hash_utils::create_hashes; use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, 
VecAllocExt}; use datafusion_expr::EmitTo; use datafusion_physical_expr::binary_map::OutputType; +use datafusion_physical_expr_common::datum::compare_with_eq; use hashbrown::raw::RawTable; /// A [`GroupValues`] that stores multiple columns of group values. @@ -234,7 +241,8 @@ impl GroupValues for GroupValuesColumn { // 1.3 Check and record which rows of the input should be appended self.append_rows_buffer.clear(); - let mut current_group_idx = self.group_values[0].len(); + let group_values_len = self.group_values[0].len(); + let mut next_group_idx = self.group_values[0].len(); for (row, &target_hash) in batch_hashes.iter().enumerate() { let entry = self.map.get_mut(target_hash, |(exist_hash, group_idx)| { // Somewhat surprisingly, this closure can be called even if the @@ -254,10 +262,17 @@ impl GroupValues for GroupValuesColumn { array_row.equal_to(lhs_row, array, rhs_row) } - for (i, group_val) in self.group_values.iter().enumerate() { - if !check_row_equal(group_val.as_ref(), *group_idx, &cols[i], row) { - return false; + if *group_idx < group_values_len { + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal(group_val.as_ref(), *group_idx, &cols[i], row) + { + return false; + } } + } else { + let row_idx_offset = group_idx - group_values_len; + let row_idx = self.append_rows_buffer[row_idx_offset]; + return is_rows_eq(cols, row, cols, row_idx).unwrap(); } true @@ -271,7 +286,7 @@ impl GroupValues for GroupValuesColumn { // Add new entry to aggr_state and save newly created index // let group_idx = group_values.num_rows(); // group_values.push(group_rows.row(row)); - let prev_group_idx = current_group_idx; + let prev_group_idx = next_group_idx; // for hasher function, use precomputed hash value self.map.insert_accounted( @@ -280,7 +295,7 @@ impl GroupValues for GroupValuesColumn { &mut self.map_size, ); self.append_rows_buffer.push(row); - current_group_idx += 1; + next_group_idx += 1; prev_group_idx } @@ -289,7 +304,7 @@ impl 
GroupValues for GroupValuesColumn { } // 1.4 Vectorized append values - for (col_idx, col) in cols.iter().enumerate() { + for col_idx in 0..cols.len() { let col_nullable = self.column_nullables_buffer[col_idx]; let group_value = &mut self.group_values[col_idx]; if col_nullable { @@ -385,3 +400,68 @@ impl GroupValues for GroupValuesColumn { self.hashes_buffer.shrink_to(count); } } + +fn is_rows_eq( + left_arrays: &[ArrayRef], + left: usize, + right_arrays: &[ArrayRef], + right: usize, +) -> Result { + let mut is_equal = true; + for (left_array, right_array) in left_arrays.iter().zip(right_arrays) { + macro_rules! compare_value { + ($T:ty) => {{ + match (left_array.is_null(left), right_array.is_null(right)) { + (false, false) => { + let left_array = + left_array.as_any().downcast_ref::<$T>().unwrap(); + let right_array = + right_array.as_any().downcast_ref::<$T>().unwrap(); + if left_array.value(left) != right_array.value(right) { + is_equal = false; + } + } + (true, false) => is_equal = false, + (false, true) => is_equal = false, + _ => {} + } + }}; + } + + match left_array.data_type() { + DataType::Null => {} + DataType::Boolean => compare_value!(BooleanArray), + DataType::Int8 => compare_value!(Int8Array), + DataType::Int16 => compare_value!(Int16Array), + DataType::Int32 => compare_value!(Int32Array), + DataType::Int64 => compare_value!(Int64Array), + DataType::UInt8 => compare_value!(UInt8Array), + DataType::UInt16 => compare_value!(UInt16Array), + DataType::UInt32 => compare_value!(UInt32Array), + DataType::UInt64 => compare_value!(UInt64Array), + DataType::Float32 => compare_value!(Float32Array), + DataType::Float64 => compare_value!(Float64Array), + DataType::Utf8 => compare_value!(StringArray), + DataType::LargeUtf8 => compare_value!(LargeStringArray), + DataType::Decimal128(..) 
=> compare_value!(Decimal128Array), + DataType::Timestamp(time_unit, None) => match time_unit { + TimeUnit::Second => compare_value!(TimestampSecondArray), + TimeUnit::Millisecond => compare_value!(TimestampMillisecondArray), + TimeUnit::Microsecond => compare_value!(TimestampMicrosecondArray), + TimeUnit::Nanosecond => compare_value!(TimestampNanosecondArray), + }, + DataType::Date32 => compare_value!(Date32Array), + DataType::Date64 => compare_value!(Date64Array), + dt => { + return not_impl_err!( + "Unsupported data type in sort merge join comparator: {}", + dt + ); + } + } + if !is_equal { + return Ok(false); + } + } + Ok(true) +} From 04ea2d27dc1fd7adca2bc84fae2f29390db3ce52 Mon Sep 17 00:00:00 2001 From: kamille Date: Fri, 18 Oct 2024 00:10:17 +0800 Subject: [PATCH 03/60] some logs. --- .../aggregates/group_values/group_column.rs | 45 +++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 8ea8b110a7aa..348780661213 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -84,6 +84,8 @@ pub trait GroupColumn: Send + Sync { pub struct PrimitiveGroupValueBuilder { group_values: Vec, nulls: MaybeNullBufferBuilder, + nullable_call: usize, + non_nullable_call: usize, } impl PrimitiveGroupValueBuilder @@ -95,6 +97,8 @@ where Self { group_values: vec![], nulls: MaybeNullBufferBuilder::new(), + nullable_call: 0, + non_nullable_call: 0, } } } @@ -116,8 +120,8 @@ impl GroupColumn self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) } - fn append_val(&mut self, array: &ArrayRef, row: usize) { + self.nullable_call += 1; // Perf: skip null check if input can't have nulls if NULLABLE { if array.is_null(row) { @@ -133,6 +137,7 @@ impl GroupColumn } fn append_non_nullable_val(&mut self, 
array: &ArrayRef, row: usize) { + self.non_nullable_call += 1; if NULLABLE { self.nulls.append(false); self.group_values.push(array.as_primitive::().value(row)); @@ -153,8 +158,14 @@ impl GroupColumn let Self { group_values, nulls, + nullable_call, + non_nullable_call, } = *self; + println!( + "### nullable_call:{nullable_call}, non_nullable_call:{non_nullable_call}" + ); + let nulls = nulls.build(); if !NULLABLE { assert!(nulls.is_none(), "unexpected nulls in non nullable input"); @@ -198,6 +209,10 @@ where offsets: Vec, /// Nulls nulls: MaybeNullBufferBuilder, + + nullable_call: usize, + + non_nullable_call: usize, } impl ByteGroupValueBuilder @@ -210,6 +225,8 @@ where buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY), offsets: vec![O::default()], nulls: MaybeNullBufferBuilder::new(), + nullable_call: 0, + non_nullable_call: 0, } } @@ -217,6 +234,7 @@ where where B: ByteArrayType, { + self.nullable_call += 1; let arr = array.as_bytes::(); if arr.is_null(row) { self.nulls.append(true); @@ -235,6 +253,7 @@ where where B: ByteArrayType, { + self.non_nullable_call += 1; let arr = array.as_bytes::(); self.nulls.append(false); let value: &[u8] = arr.value(row).as_ref(); @@ -330,7 +349,7 @@ where } _ => unreachable!("View types should use `ArrowBytesViewMap`"), }; - } + } fn len(&self) -> usize { self.offsets.len() - 1 @@ -348,8 +367,14 @@ where mut buffer, offsets, nulls, + nullable_call, + non_nullable_call, } = *self; + println!( + "### nullable_call:{nullable_call}, non_nullable_call:{non_nullable_call}" + ); + let null_buffer = nulls.build(); // SAFETY: the offsets were constructed correctly in `insert_if_new` -- @@ -427,7 +452,7 @@ where } _ => unreachable!("View types should use `ArrowBytesViewMap`"), } - } + } } /// An implementation of [`GroupColumn`] for binary view and utf8 view types. 
@@ -469,6 +494,10 @@ pub struct ByteViewGroupValueBuilder { /// Nulls nulls: MaybeNullBufferBuilder, + nullable_call: usize, + + non_nullable_call: usize, + /// phantom data so the type requires `` _phantom: PhantomData, } @@ -482,6 +511,8 @@ impl ByteViewGroupValueBuilder { max_block_size: BYTE_VIEW_MAX_BLOCK_SIZE, nulls: MaybeNullBufferBuilder::new(), _phantom: PhantomData {}, + nullable_call: 0, + non_nullable_call: 0, } } @@ -495,6 +526,7 @@ impl ByteViewGroupValueBuilder { where B: ByteViewType, { + self.nullable_call += 1; let arr = array.as_byte_view::(); // Null row case, set and return @@ -531,6 +563,7 @@ impl ByteViewGroupValueBuilder { where B: ByteViewType, { + self.non_nullable_call += 1; let arr = array.as_byte_view::(); // Not null row case @@ -666,6 +699,10 @@ impl ByteViewGroupValueBuilder { let views = ScalarBuffer::from(views); + println!( + "### nullable_call:{}, non_nullable_call:{}", + self.nullable_call, self.non_nullable_call + ); // Safety: // * all views were correctly made // * (if utf8): Input was valid Utf8 so buffer contents are @@ -850,7 +887,7 @@ impl GroupColumn for ByteViewGroupValueBuilder { fn append_val(&mut self, array: &ArrayRef, row: usize) { self.append_val_inner(array, row) } - + fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { self.append_val_non_nullable_inner(array, row); } From a83c2ea8019d3f7ce465550edba19d09ff0f7a4f Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 19 Oct 2024 17:01:52 +0800 Subject: [PATCH 04/60] add `append_n` in `MaybeNullBufferBuilder`. 
--- .../aggregates/group_values/group_column.rs | 8 ++++++-- .../aggregates/group_values/null_builder.rs | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 348780661213..4a9b42d6f45f 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -61,6 +61,8 @@ pub trait GroupColumn: Send + Sync { fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize); + fn append_batch(&mut self, array: &ArrayRef, rows: &[usize]); + /// Returns the number of rows stored in this builder fn len(&self) -> usize; /// Returns the number of bytes used by this [`GroupColumn`] @@ -120,8 +122,11 @@ impl GroupColumn self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) } + fn append_batch(&mut self, array: &ArrayRef, rows: &[usize]) { + todo!() + } + fn append_val(&mut self, array: &ArrayRef, row: usize) { - self.nullable_call += 1; // Perf: skip null check if input can't have nulls if NULLABLE { if array.is_null(row) { @@ -137,7 +142,6 @@ impl GroupColumn } fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { - self.non_nullable_call += 1; if NULLABLE { self.nulls.append(false); self.group_values.push(array.as_primitive::().value(row)); diff --git a/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs b/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs index 0249390f38cd..a584cf58e50a 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/null_builder.rs @@ -70,6 +70,24 @@ impl MaybeNullBufferBuilder { } } + pub fn append_n(&mut self, n: usize, is_null: bool) { + match self { + Self::NoNulls { row_count } if is_null => { + // have seen no nulls so far, 
this is the first null, + // need to create the nulls buffer for all currently valid values + // alloc 2x the need given we push a new but immediately + let mut nulls = BooleanBufferBuilder::new(*row_count * 2); + nulls.append_n(*row_count, true); + nulls.append_n(n, false); + *self = Self::Nulls(nulls); + } + Self::NoNulls { row_count } => { + *row_count += n; + } + Self::Nulls(builder) => builder.append_n(n, !is_null), + } + } + /// return the number of heap allocated bytes used by this structure to store boolean values pub fn allocated_size(&self) -> usize { match self { From 3df75ac8eb1f51c17a8efbde0713b34382997ea8 Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 19 Oct 2024 18:36:44 +0800 Subject: [PATCH 05/60] impl basic append_batch --- .../src/aggregates/group_values/column.rs | 16 +- .../aggregates/group_values/group_column.rs | 209 ++++++++++-------- 2 files changed, 119 insertions(+), 106 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 8e90f883668d..26e7c22dcbbb 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -305,17 +305,13 @@ impl GroupValues for GroupValuesColumn { // 1.4 Vectorized append values for col_idx in 0..cols.len() { - let col_nullable = self.column_nullables_buffer[col_idx]; + let all_non_null = !self.column_nullables_buffer[col_idx]; let group_value = &mut self.group_values[col_idx]; - if col_nullable { - for &row in self.append_rows_buffer.iter() { - group_value.append_val(&cols[col_idx], row); - } - } else { - for &row in self.append_rows_buffer.iter() { - group_value.append_non_nullable_val(&cols[col_idx], row); - } - } + group_value.append_batch( + &cols[col_idx], + &self.append_rows_buffer, + all_non_null, + ); } Ok(()) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs 
b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 4a9b42d6f45f..a6961835edef 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -22,6 +22,7 @@ use arrow::array::GenericBinaryArray; use arrow::array::GenericStringArray; use arrow::array::OffsetSizeTrait; use arrow::array::PrimitiveArray; +use arrow::array::StringViewBuilder; use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; use arrow::buffer::OffsetBuffer; use arrow::buffer::ScalarBuffer; @@ -29,9 +30,11 @@ use arrow::datatypes::ByteArrayType; use arrow::datatypes::ByteViewType; use arrow::datatypes::DataType; use arrow::datatypes::GenericBinaryType; +use arrow_array::GenericByteArray; use arrow_array::GenericByteViewArray; use arrow_buffer::Buffer; use datafusion_common::utils::proxy::VecAllocExt; +use datafusion_expr::sqlparser::keywords::NULLABLE; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow_array::types::GenericStringType; @@ -59,9 +62,7 @@ pub trait GroupColumn: Send + Sync { /// Appends the row at `row` in `array` to this builder fn append_val(&mut self, array: &ArrayRef, row: usize); - fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize); - - fn append_batch(&mut self, array: &ArrayRef, rows: &[usize]); + fn append_batch(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool); /// Returns the number of rows stored in this builder fn len(&self) -> usize; @@ -86,8 +87,6 @@ pub trait GroupColumn: Send + Sync { pub struct PrimitiveGroupValueBuilder { group_values: Vec, nulls: MaybeNullBufferBuilder, - nullable_call: usize, - non_nullable_call: usize, } impl PrimitiveGroupValueBuilder @@ -99,8 +98,6 @@ where Self { group_values: vec![], nulls: MaybeNullBufferBuilder::new(), - nullable_call: 0, - non_nullable_call: 0, } } } @@ -122,9 +119,35 @@ impl GroupColumn self.group_values[lhs_row] == 
array.as_primitive::().value(rhs_row) } - fn append_batch(&mut self, array: &ArrayRef, rows: &[usize]) { - todo!() - } + fn append_batch(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool) { + let arr = array.as_primitive::(); + match (NULLABLE, all_non_null) { + (true, true) => { + self.nulls.append_n(rows.len(), false); + self.group_values.reserve(rows.len()); + for &row in rows { + self.group_values.push(arr.value(row)); + } + } + (true, false) => { + for &row in rows { + if array.is_null(row) { + self.nulls.append(true); + self.group_values.push(T::default_value()); + } else { + self.nulls.append(false); + self.group_values.push(arr.value(row)); + } + } + } + (false, _) => { + self.group_values.reserve(rows.len()); + for &row in rows { + self.group_values.push(arr.value(row)); + } + } + } + } fn append_val(&mut self, array: &ArrayRef, row: usize) { // Perf: skip null check if input can't have nulls @@ -141,15 +164,6 @@ impl GroupColumn } } - fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { - if NULLABLE { - self.nulls.append(false); - self.group_values.push(array.as_primitive::().value(row)); - } else { - self.group_values.push(array.as_primitive::().value(row)); - } - } - fn len(&self) -> usize { self.group_values.len() } @@ -162,14 +176,8 @@ impl GroupColumn let Self { group_values, nulls, - nullable_call, - non_nullable_call, } = *self; - println!( - "### nullable_call:{nullable_call}, non_nullable_call:{non_nullable_call}" - ); - let nulls = nulls.build(); if !NULLABLE { assert!(nulls.is_none(), "unexpected nulls in non nullable input"); @@ -213,10 +221,6 @@ where offsets: Vec, /// Nulls nulls: MaybeNullBufferBuilder, - - nullable_call: usize, - - non_nullable_call: usize, } impl ByteGroupValueBuilder @@ -229,8 +233,36 @@ where buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY), offsets: vec![O::default()], nulls: MaybeNullBufferBuilder::new(), - nullable_call: 0, - non_nullable_call: 0, + } + } + + fn append_batch_inner( 
+ &mut self, + array: &ArrayRef, + rows: &[usize], + all_non_null: bool, + ) where + B: ByteArrayType, + { + let arr = array.as_bytes::(); + + if all_non_null { + self.nulls.append_n(rows.len(), false); + for &row in rows { + self.append_value(arr, row); + } + } else { + for &row in rows { + if arr.is_null(row) { + self.nulls.append(true); + // nulls need a zero length in the offset buffer + let offset = self.buffer.len(); + self.offsets.push(O::usize_as(offset)); + } else { + self.nulls.append(false); + self.append_value(arr, row); + } + } } } @@ -238,7 +270,6 @@ where where B: ByteArrayType, { - self.nullable_call += 1; let arr = array.as_bytes::(); if arr.is_null(row) { self.nulls.append(true); @@ -247,20 +278,15 @@ where self.offsets.push(O::usize_as(offset)); } else { self.nulls.append(false); - let value: &[u8] = arr.value(row).as_ref(); - self.buffer.append_slice(value); - self.offsets.push(O::usize_as(self.buffer.len())); + self.append_value(arr, row); } } - fn append_non_nullable_val_inner(&mut self, array: &ArrayRef, row: usize) + fn append_value(&mut self, array: &GenericByteArray, row: usize) where B: ByteArrayType, { - self.non_nullable_call += 1; - let arr = array.as_bytes::(); - self.nulls.append(false); - let value: &[u8] = arr.value(row).as_ref(); + let value: &[u8] = array.value(row).as_ref(); self.buffer.append_slice(value); self.offsets.push(O::usize_as(self.buffer.len())); } @@ -313,28 +339,35 @@ where } } - fn append_val(&mut self, column: &ArrayRef, row: usize) { - // Sanity array type + fn append_batch(&mut self, column: &ArrayRef, rows: &[usize], all_non_null: bool) { match self.output_type { OutputType::Binary => { debug_assert!(matches!( column.data_type(), DataType::Binary | DataType::LargeBinary )); - self.append_val_inner::>(column, row) + self.append_batch_inner::>( + column, + rows, + all_non_null, + ) } OutputType::Utf8 => { debug_assert!(matches!( column.data_type(), DataType::Utf8 | DataType::LargeUtf8 )); - 
self.append_val_inner::>(column, row) + self.append_batch_inner::>( + column, + rows, + all_non_null, + ) } _ => unreachable!("View types should use `ArrowBytesViewMap`"), }; } - fn append_non_nullable_val(&mut self, column: &ArrayRef, row: usize) { + fn append_val(&mut self, column: &ArrayRef, row: usize) { // Sanity array type match self.output_type { OutputType::Binary => { @@ -342,14 +375,14 @@ where column.data_type(), DataType::Binary | DataType::LargeBinary )); - self.append_non_nullable_val_inner::>(column, row) + self.append_val_inner::>(column, row) } OutputType::Utf8 => { debug_assert!(matches!( column.data_type(), DataType::Utf8 | DataType::LargeUtf8 )); - self.append_non_nullable_val_inner::>(column, row) + self.append_val_inner::>(column, row) } _ => unreachable!("View types should use `ArrowBytesViewMap`"), }; @@ -371,14 +404,8 @@ where mut buffer, offsets, nulls, - nullable_call, - non_nullable_call, } = *self; - println!( - "### nullable_call:{nullable_call}, non_nullable_call:{non_nullable_call}" - ); - let null_buffer = nulls.build(); // SAFETY: the offsets were constructed correctly in `insert_if_new` -- @@ -498,10 +525,6 @@ pub struct ByteViewGroupValueBuilder { /// Nulls nulls: MaybeNullBufferBuilder, - nullable_call: usize, - - non_nullable_call: usize, - /// phantom data so the type requires `` _phantom: PhantomData, } @@ -515,8 +538,6 @@ impl ByteViewGroupValueBuilder { max_block_size: BYTE_VIEW_MAX_BLOCK_SIZE, nulls: MaybeNullBufferBuilder::new(), _phantom: PhantomData {}, - nullable_call: 0, - non_nullable_call: 0, } } @@ -526,11 +547,34 @@ impl ByteViewGroupValueBuilder { self } - fn append_val_inner(&mut self, array: &ArrayRef, row: usize) - where - B: ByteViewType, - { - self.nullable_call += 1; + fn append_batch_inner( + &mut self, + array: &ArrayRef, + rows: &[usize], + all_non_null: bool, + ) { + let arr = array.as_byte_view::(); + + if all_non_null { + self.nulls.append_n(rows.len(), false); + for &row in rows { + 
self.append_value(arr, row); + } + } else { + for &row in rows { + // Null row case, set and return + if arr.is_valid(row) { + self.nulls.append(false); + self.append_value(arr, row); + } else { + self.nulls.append(true); + self.views.push(0); + } + } + } + } + + fn append_val_inner(&mut self, array: &ArrayRef, row: usize) { let arr = array.as_byte_view::(); // Null row case, set and return @@ -542,37 +586,14 @@ impl ByteViewGroupValueBuilder { // Not null row case self.nulls.append(false); - let value: &[u8] = arr.value(row).as_ref(); - - let value_len = value.len(); - let view = if value_len <= 12 { - make_view(value, 0, 0) - } else { - // Ensure big enough block to hold the value firstly - self.ensure_in_progress_big_enough(value_len); - - // Append value - let buffer_index = self.completed.len(); - let offset = self.in_progress.len(); - self.in_progress.extend_from_slice(value); - - make_view(value, buffer_index as u32, offset as u32) - }; - - // Append view - self.views.push(view); + self.append_value(arr, row); } - fn append_val_non_nullable_inner(&mut self, array: &ArrayRef, row: usize) + fn append_value(&mut self, array: &GenericByteViewArray, row: usize) where B: ByteViewType, { - self.non_nullable_call += 1; - let arr = array.as_byte_view::(); - - // Not null row case - self.nulls.append(false); - let value: &[u8] = arr.value(row).as_ref(); + let value: &[u8] = array.value(row).as_ref(); let value_len = value.len(); let view = if value_len <= 12 { @@ -703,10 +724,6 @@ impl ByteViewGroupValueBuilder { let views = ScalarBuffer::from(views); - println!( - "### nullable_call:{}, non_nullable_call:{}", - self.nullable_call, self.non_nullable_call - ); // Safety: // * all views were correctly made // * (if utf8): Input was valid Utf8 so buffer contents are @@ -892,8 +909,8 @@ impl GroupColumn for ByteViewGroupValueBuilder { self.append_val_inner(array, row) } - fn append_non_nullable_val(&mut self, array: &ArrayRef, row: usize) { - 
self.append_val_non_nullable_inner(array, row); + fn append_batch(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool) { + self.append_batch_inner(array, rows, all_non_null); } fn len(&self) -> usize { From 13c9489d7eaae7c194d86f5bf783d0293896cdf1 Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 19 Oct 2024 19:07:44 +0800 Subject: [PATCH 06/60] fix equal to. --- .../src/aggregates/group_values/column.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 26e7c22dcbbb..29e0e434c392 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -29,11 +29,11 @@ use arrow::datatypes::{ }; use arrow::record_batch::RecordBatch; use arrow_array::{ - Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - LargeStringArray, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, + Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, LargeStringArray, StringArray, StringViewArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{DataType, Schema, SchemaRef, TimeUnit}; use datafusion_common::hash_utils::create_hashes; @@ -439,6 +439,9 @@ fn is_rows_eq( DataType::Float64 => compare_value!(Float64Array), DataType::Utf8 => compare_value!(StringArray), DataType::LargeUtf8 => compare_value!(LargeStringArray), + DataType::Binary => compare_value!(BinaryArray), + 
DataType::Utf8View => compare_value!(StringViewArray), + DataType::BinaryView => compare_value!(BinaryViewArray), DataType::Decimal128(..) => compare_value!(Decimal128Array), DataType::Timestamp(time_unit, None) => match time_unit { TimeUnit::Second => compare_value!(TimestampSecondArray), From 5fd63e8a5537de1098916a41d1ed49624cf3089f Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 02:40:10 +0800 Subject: [PATCH 07/60] define `GroupIndexContext`. --- .../src/aggregates/group_values/column.rs | 48 +++++++++++++++---- .../aggregates/group_values/group_column.rs | 2 +- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 29e0e434c392..b6c7096ea98f 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -45,9 +45,42 @@ use datafusion_physical_expr::binary_map::OutputType; use datafusion_physical_expr_common::datum::compare_with_eq; use hashbrown::raw::RawTable; +/// Group index context for performing `vectorized compare` and `vectorized append` +struct GroupIndexContext { + /// It is possible that hash value collision exists, + /// and we will chain the `group indices` with same hash value + /// + /// The chained indices is like: + /// `latest group index -> older group index -> even older group index -> ...` + prev_group_index: usize, + + /// It is possible that rows with same hash values exist in `input cols`. + /// And if we `vectorized compare` and `vectorized append` them + /// in the same round, some fault cases will occur especially when + /// they are totally the repeated rows... + /// + /// For example: + /// - Two repeated rows exist in `input cols`. 
+ /// + /// - We found their hash values equal to one exist group + /// + /// - We then perform `vectorized compare` for them to the exist group, + /// and found their values not equal to the exist one + /// + /// - Finally when perform `vectorized append`, we decide to build two + /// respective new groups for them, even we actually just need one + /// new group... + /// + /// So for solving such cases simply, if some rows with same hash value + /// in `input cols`, just allow to process one of them in a round, + /// and this flag is used to represent that one of them is processing + /// in current round. + /// + checking: bool, +} + /// A [`GroupValues`] that stores multiple columns of group values. /// -/// pub struct GroupValuesColumn { /// The output schema schema: SchemaRef, @@ -62,6 +95,11 @@ pub struct GroupValuesColumn { /// values: (hash, group_index) map: RawTable<(u64, usize)>, + group_index_ctxs: Vec, + + /// Some + remaining_indices: Vec, + /// The size of `map` in bytes map_size: usize, @@ -94,6 +132,7 @@ impl GroupValuesColumn { Ok(Self { schema, map, + group_index_ctxs: Vec::new(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), @@ -160,13 +199,6 @@ macro_rules! 
instantiate_primitive { }; } -fn append_col_value(mut core: C, array: &ArrayRef, row: usize) -where - C: FnMut(&ArrayRef, usize), -{ - core(array, row); -} - impl GroupValues for GroupValuesColumn { fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { let n_rows = cols[0].len(); diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index a6961835edef..95690fce596d 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -554,7 +554,7 @@ impl ByteViewGroupValueBuilder { all_non_null: bool, ) { let arr = array.as_byte_view::(); - + if all_non_null { self.nulls.append_n(rows.len(), false); for &row in rows { From d4b58205916045f8d488da66ce38ee56fbed2378 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 02:59:08 +0800 Subject: [PATCH 08/60] define the structs useful in vectorizing. --- .../src/aggregates/group_values/column.rs | 33 ++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index b6c7096ea98f..c80c15f873ae 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -95,13 +95,32 @@ pub struct GroupValuesColumn { /// values: (hash, group_index) map: RawTable<(u64, usize)>, + /// The size of `map` in bytes + map_size: usize, + + /// Contexts useful for `vectorized compare` and `vectorized append`, + /// detail can see [`GroupIndexContext`] group_index_ctxs: Vec, - /// Some + /// We need multiple rounds to process the `input cols`, + /// and the rows processing in current round is stored here. 
+ current_indices: Vec, + + /// Similar as `current_indices`, but `remaining_indices` + /// is used to store the rows will be processed in next round. remaining_indices: Vec, - /// The size of `map` in bytes - map_size: usize, + /// The `vectorized compared` row indices buffer + vectorized_compare_row_indices: Vec, + + /// The `vectorized compared` group indices buffer + vectorized_compare_group_indices: Vec, + + /// The `vectorized compared` result buffer + vectorized_compare_results: Vec, + + /// The `vectorized append` row indices buffer + vectorized_append_row_indices: Vec, /// The actual group by values, stored column-wise. Compare from /// the left to right, each column is stored as [`GroupColumn`]. @@ -138,7 +157,13 @@ impl GroupValuesColumn { hashes_buffer: Default::default(), random_state: Default::default(), column_nullables_buffer: vec![false; num_cols], - append_rows_buffer: Vec::new(), + append_rows_buffer: Default::default(), + current_indices: Default::default(), + remaining_indices: Default::default(), + vectorized_compare_row_indices: Default::default(), + vectorized_compare_group_indices: Default::default(), + vectorized_compare_results: Default::default(), + vectorized_append_row_indices: Default::default(), }) } From 04f35bb85d9e62620ce63fc058403e92265df65b Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 13:55:26 +0800 Subject: [PATCH 09/60] re-define some structs for vectorized operations. 
--- .../src/aggregates/group_values/column.rs | 115 ++++++++++++------ 1 file changed, 78 insertions(+), 37 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index c80c15f873ae..4e38e87721c3 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -43,40 +43,71 @@ use datafusion_expr::EmitTo; use datafusion_physical_expr::binary_map::OutputType; use datafusion_physical_expr_common::datum::compare_with_eq; -use hashbrown::raw::RawTable; +use hashbrown::raw::{Bucket, RawTable}; -/// Group index context for performing `vectorized compare` and `vectorized append` -struct GroupIndexContext { - /// It is possible that hash value collision exists, - /// and we will chain the `group indices` with same hash value - /// - /// The chained indices is like: - /// `latest group index -> older group index -> even older group index -> ...` - prev_group_index: usize, +const CHECKING_FLAG_MASK: u64 = 0x8000000000000000; +const SET_CHECKING_FLAG_MASK: u64 = 0x8000000000000000; +const UNSET_CHECKING_FLAG_MASK: u64 = 0x7FFFFFFFFFFFFFFF; - /// It is possible that rows with same hash values exist in `input cols`. - /// And if we `vectorized compare` and `vectorized append` them - /// in the same round, some fault cases will occur especially when - /// they are totally the repeated rows... - /// - /// For example: - /// - Two repeated rows exist in `input cols`. - /// - /// - We found their hash values equal to one exist group - /// - /// - We then perform `vectorized compare` for them to the exist group, - /// and found their values not equal to the exist one - /// - /// - Finally when perform `vectorized append`, we decide to build two - /// respective new groups for them, even we actually just need one - /// new group... 
- /// - /// So for solving such cases simply, if some rows with same hash value - /// in `input cols`, just allow to process one of them in a round, - /// and this flag is used to represent that one of them is processing - /// in current round. - /// - checking: bool, +/// `BucketContext` is a packed struct +/// +/// ### Format: +/// +/// +---------------------+--------------------+ +/// | checking flag(1bit) | group index(63bit) | +/// +---------------------+--------------------+ +/// +/// ### Checking flag +/// +/// It is possible that rows with same hash values exist in `input cols`. +/// And if we `vectorized compare` and `vectorized append` them +/// in the same round, some fault cases will occur especially when +/// they are totally the repeated rows... +/// +/// For example: +/// - Two repeated rows exist in `input cols`. +/// +/// - We found their hash values equal to one exist group +/// +/// - We then perform `vectorized compare` for them to the exist group, +/// and found their values not equal to the exist one +/// +/// - Finally when perform `vectorized append`, we decide to build two +/// respective new groups for them, even we actually just need one +/// new group... +/// +/// So for solving such cases simply, if some rows with same hash value +/// in `input cols`, just allow to process one of them in a round, +/// and this flag is used to represent that one of them is processing +/// in current round. 
+/// +/// ### Group index +/// +/// The group's index in group values +/// +#[derive(Debug, Clone, Copy)] +struct BucketContext(u64); + +impl BucketContext { + #[inline] + pub fn is_checking(&self) -> bool { + (self.0 & CHECKING_FLAG_MASK) > 0 + } + + #[inline] + pub fn set_checking(&mut self) { + self.0 |= SET_CHECKING_FLAG_MASK + } + + #[inline] + pub fn unset_checking(&mut self) { + self.0 &= UNSET_CHECKING_FLAG_MASK + } + + #[inline] + pub fn group_index(&self) -> u64 { + self.0 & UNSET_CHECKING_FLAG_MASK + } } /// A [`GroupValues`] that stores multiple columns of group values. @@ -93,14 +124,24 @@ pub struct GroupValuesColumn { /// /// keys: u64 hashes of the GroupValue /// values: (hash, group_index) - map: RawTable<(u64, usize)>, + map: RawTable<(u64, BucketContext)>, /// The size of `map` in bytes map_size: usize, - /// Contexts useful for `vectorized compare` and `vectorized append`, - /// detail can see [`GroupIndexContext`] - group_index_ctxs: Vec, + /// The lists for group indices with the same hash value + /// + /// It is possible that hash value collision exists, + /// and we will chain the `group indices` with same hash value + /// + /// The chained indices is like: + /// `latest group index -> older group index -> even older group index -> ...` + group_index_lists: Vec, + + /// The marked checking buckets in this round + /// + /// About the checking flag you can see [`BucketContext`] + checking_bucket: Vec>, /// We need multiple rounds to process the `input cols`, /// and the rows processing in current round is stored here. @@ -151,7 +192,7 @@ impl GroupValuesColumn { Ok(Self { schema, map, - group_index_ctxs: Vec::new(), + group_index_lists: Vec::new(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), From d215937ad04f3f3ac3ce84b9bbe9d27d7da26f93 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 22:23:28 +0800 Subject: [PATCH 10/60] impl some vectorized logics. 
--- .../src/aggregates/group_values/column.rs | 141 +++++++++--------- 1 file changed, 67 insertions(+), 74 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 4e38e87721c3..967001b8dd13 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -141,7 +141,7 @@ pub struct GroupValuesColumn { /// The marked checking buckets in this round /// /// About the checking flag you can see [`BucketContext`] - checking_bucket: Vec>, + checking_buckets: Vec>, /// We need multiple rounds to process the `input cols`, /// and the rows processing in current round is stored here. @@ -193,6 +193,7 @@ impl GroupValuesColumn { schema, map, group_index_lists: Vec::new(), + checking_buckets: Default::default(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), @@ -326,92 +327,84 @@ impl GroupValues for GroupValuesColumn { // tracks to which group each of the input rows belongs groups.clear(); - // 1.1 Calculate the group keys for the group values + // General steps for one round `vectorized compare & append`: + // 1. Calculate and check hash values of `cols` in `map` + // 2. Perform `vectorized compare` + // 3. Perform `vectorized append` + // 4. Reset the checking flag in `BucketContext` + + // 1. 
Calculate and check hash values of `cols` in `map` + // + // 1.1 If bucket not found + // - Insert the `new bucket` build from the `group index` + // and its hash value to `map` + // - Mark this `new bucket` checking, and add it to `checking_buckets` + // - Add row index to `vectorized_append_row_indices` + // + // 1.2 bucket found + // - Check if the `bucket` checking, if so add it to `remaining_indices`, + // and just process it in next round, otherwise we continue the process + // - Mark `bucket` checking, and add it to `checking_buckets` + // - Add row index to `vectorized_compare_row_indices` + // - Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` + // let batch_hashes = &mut self.hashes_buffer; batch_hashes.clear(); batch_hashes.resize(n_rows, 0); create_hashes(cols, &self.random_state, batch_hashes)?; - // 1.2 Check if columns nullable - for (col_idx, col) in cols.iter().enumerate() { - self.column_nullables_buffer[col_idx] = (col.null_count() != 0); - } - - // 1.3 Check and record which rows of the input should be appended - self.append_rows_buffer.clear(); - let group_values_len = self.group_values[0].len(); - let mut next_group_idx = self.group_values[0].len(); - for (row, &target_hash) in batch_hashes.iter().enumerate() { - let entry = self.map.get_mut(target_hash, |(exist_hash, group_idx)| { - // Somewhat surprisingly, this closure can be called even if the - // hash doesn't match, so check the hash first with an integer - // comparison first avoid the more expensive comparison with - // group value. 
https://github.com/apache/datafusion/pull/11718 - if target_hash != *exist_hash { - return false; - } - - fn check_row_equal( - array_row: &dyn GroupColumn, - lhs_row: usize, - array: &ArrayRef, - rhs_row: usize, - ) -> bool { - array_row.equal_to(lhs_row, array, rhs_row) - } - - if *group_idx < group_values_len { - for (i, group_val) in self.group_values.iter().enumerate() { - if !check_row_equal(group_val.as_ref(), *group_idx, &cols[i], row) - { - return false; - } - } - } else { - let row_idx_offset = group_idx - group_values_len; - let row_idx = self.append_rows_buffer[row_idx_offset]; - return is_rows_eq(cols, row, cols, row_idx).unwrap(); - } - - true - }); - - let group_idx = match entry { - // Existing group_index for this group value - Some((_hash, group_idx)) => *group_idx, - // 1.2 Need to create new entry for the group - None => { - // Add new entry to aggr_state and save newly created index - // let group_idx = group_values.num_rows(); - // group_values.push(group_rows.row(row)); - let prev_group_idx = next_group_idx; + let num_rows = cols[0].len(); + self.current_indices.clear(); + self.current_indices.extend(0..num_rows); + while self.current_indices.len() > 0 { + let mut next_group_idx = self.group_values[0].len() as u64; + for (row, &target_hash) in batch_hashes.iter().enumerate() { + let entry = self.map.get_mut(target_hash, |(exist_hash, _)| { + // Somewhat surprisingly, this closure can be called even if the + // hash doesn't match, so check the hash first with an integer + // comparison first avoid the more expensive comparison with + // group value. 
https://github.com/apache/datafusion/pull/11718 + target_hash == *exist_hash + }); + + let Some((_, bucket_ctx)) = entry else { + // 1.1 Bucket not found case + // Insert the `new bucket` build from the `group index` + // Mark this `new bucket` checking, and add it to `checking_buckets` + let current_group_idx = next_group_idx; // for hasher function, use precomputed hash value - self.map.insert_accounted( - (target_hash, prev_group_idx), - |(hash, _group_index)| *hash, + let mut bucket_ctx = BucketContext(current_group_idx); + bucket_ctx.set_checking(); + let bucket = self.map.insert_accounted( + (target_hash, bucket_ctx), + |(hash, _)| *hash, &mut self.map_size, ); - self.append_rows_buffer.push(row); - next_group_idx += 1; + self.checking_buckets.push(bucket); + + // Add row index to `vectorized_append_row_indices` + self.vectorized_append_row_indices.push(row); - prev_group_idx + next_group_idx += 1; + continue; + }; + + // 1.2 bucket found + // Check if the `bucket` checking, if so add it to `remaining_indices`, + // and just process it in next round, otherwise we continue the process + if bucket_ctx.is_checking() { + self.remaining_indices.push(row); + continue; } - }; - groups.push(group_idx); - } + // Mark `bucket` checking, and add it to `checking_buckets` + bucket_ctx.set_checking(); - // 1.4 Vectorized append values - for col_idx in 0..cols.len() { - let all_non_null = !self.column_nullables_buffer[col_idx]; - let group_value = &mut self.group_values[col_idx]; - group_value.append_batch( - &cols[col_idx], - &self.append_rows_buffer, - all_non_null, - ); + // Add row index to `vectorized_compare_row_indices` + // Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` + self.vectorized_compare_row_indices.push(row); + } } - Ok(()) } From 2af6ff5ad48e481707e5698d8a9d79335df6e749 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 22:47:00 +0800 Subject: [PATCH 11/60] impl chekcing hashmap stage. 
--- .../src/aggregates/group_values/column.rs | 73 +++++++++++++++---- 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 967001b8dd13..e652573879a9 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -136,7 +136,7 @@ pub struct GroupValuesColumn { /// /// The chained indices is like: /// `latest group index -> older group index -> even older group index -> ...` - group_index_lists: Vec, + group_index_lists: Vec, /// The marked checking buckets in this round /// @@ -245,6 +245,44 @@ impl GroupValuesColumn { | DataType::BinaryView ) } + + #[inline] + fn get_group_indices_from_list( + &self, + start_group_index: usize, + ) -> GroupIndicesIterator { + GroupIndicesIterator::new(start_group_index, &self.group_index_lists) + } +} + +struct GroupIndicesIterator<'a> { + next_group_index: usize, + group_index_lists: &'a [usize], +} + +impl<'a> GroupIndicesIterator<'a> { + fn new(start_group_index: usize, group_index_lists: &'a [usize]) -> Self { + Self { + next_group_index: start_group_index + 1, + group_index_lists, + } + } +} + +impl<'a> Iterator for GroupIndicesIterator<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + if self.next_group_index == 0 { + return None; + } + + let current_group_index = self.next_group_index; + let next_group_index = self.group_index_lists[current_group_index]; + self.next_group_index = next_group_index; + + Some(current_group_index - 1) + } } /// instantiates a [`PrimitiveGroupValueBuilder`] and pushes it into $v @@ -358,6 +396,10 @@ impl GroupValues for GroupValuesColumn { self.current_indices.extend(0..num_rows); while self.current_indices.len() > 0 { let mut next_group_idx = self.group_values[0].len() as u64; + self.vectorized_append_row_indices.clear(); + 
self.vectorized_compare_row_indices.clear(); + self.vectorized_compare_group_indices.clear(); + self.vectorized_compare_results.clear(); for (row, &target_hash) in batch_hashes.iter().enumerate() { let entry = self.map.get_mut(target_hash, |(exist_hash, _)| { // Somewhat surprisingly, this closure can be called even if the @@ -402,7 +444,12 @@ impl GroupValues for GroupValuesColumn { // Add row index to `vectorized_compare_row_indices` // Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` - self.vectorized_compare_row_indices.push(row); + let group_indices = + self.get_group_indices_from_list(bucket_ctx.group_index() as usize); + for group_index in group_indices { + self.vectorized_compare_row_indices.push(row); + self.vectorized_compare_group_indices.push(group_index); + } } } Ok(()) @@ -444,17 +491,17 @@ impl GroupValues for GroupValuesColumn { .collect::>(); // SAFETY: self.map outlives iterator and is not modified concurrently - unsafe { - for bucket in self.map.iter() { - // Decrement group index by n - match bucket.as_ref().1.checked_sub(n) { - // Group index was >= n, shift value down - Some(sub) => bucket.as_mut().1 = sub, - // Group index was < n, so remove from table - None => self.map.erase(bucket), - } - } - } + // unsafe { + // for bucket in self.map.iter() { + // // Decrement group index by n + // match bucket.as_ref().1.0.checked_sub(n) { + // // Group index was >= n, shift value down + // Some(sub) => bucket.as_mut().1 = sub, + // // Group index was < n, so remove from table + // None => self.map.erase(bucket), + // } + // } + // } output } From 473914ab64b30b4741461b23bce48a99861cc921 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 22:59:24 +0800 Subject: [PATCH 12/60] fix compile. 
--- .../src/aggregates/group_values/column.rs | 127 +++++++----------- 1 file changed, 52 insertions(+), 75 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index e652573879a9..a13094d26a54 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -253,35 +253,62 @@ impl GroupValuesColumn { ) -> GroupIndicesIterator { GroupIndicesIterator::new(start_group_index, &self.group_index_lists) } -} -struct GroupIndicesIterator<'a> { - next_group_index: usize, - group_index_lists: &'a [usize], -} + fn collect_vectorized_process_context(&mut self, batch_hashes: &[u64]) { + let mut next_group_idx = self.group_values[0].len() as u64; + for &row in self.current_indices.iter() { + let target_hash = batch_hashes[row]; + let entry = self.map.get_mut(target_hash, |(exist_hash, _)| { + // Somewhat surprisingly, this closure can be called even if the + // hash doesn't match, so check the hash first with an integer + // comparison first avoid the more expensive comparison with + // group value. 
https://github.com/apache/datafusion/pull/11718 + target_hash == *exist_hash + }); + + let Some((_, bucket_ctx)) = entry else { + // 1.1 Bucket not found case + // Insert the `new bucket` build from the `group index` + // Mark this `new bucket` checking, and add it to `checking_buckets` + let current_group_idx = next_group_idx; + + // for hasher function, use precomputed hash value + let mut bucket_ctx = BucketContext(current_group_idx); + bucket_ctx.set_checking(); + let bucket = self.map.insert_accounted( + (target_hash, bucket_ctx), + |(hash, _)| *hash, + &mut self.map_size, + ); + self.checking_buckets.push(bucket); -impl<'a> GroupIndicesIterator<'a> { - fn new(start_group_index: usize, group_index_lists: &'a [usize]) -> Self { - Self { - next_group_index: start_group_index + 1, - group_index_lists, - } - } -} + // Add row index to `vectorized_append_row_indices` + self.vectorized_append_row_indices.push(row); -impl<'a> Iterator for GroupIndicesIterator<'a> { - type Item = usize; + next_group_idx += 1; + continue; + }; - fn next(&mut self) -> Option { - if self.next_group_index == 0 { - return None; + // 1.2 bucket found + // Check if the `bucket` checking, if so add it to `remaining_indices`, + // and just process it in next round, otherwise we continue the process + if bucket_ctx.is_checking() { + self.remaining_indices.push(row); + continue; + } + // Mark `bucket` checking, and add it to `checking_buckets` + bucket_ctx.set_checking(); + + // Add row index to `vectorized_compare_row_indices` + // Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` + let mut next_group_index = bucket_ctx.group_index() as usize + 1; + while next_group_index > 0 { + let current_group_index = next_group_index; + self.vectorized_compare_row_indices.push(row); + self.vectorized_compare_group_indices.push(current_group_index - 1); + next_group_index = self.group_index_lists[current_group_index]; + } } - - let current_group_index = self.next_group_index; 
- let next_group_index = self.group_index_lists[current_group_index]; - self.next_group_index = next_group_index; - - Some(current_group_index - 1) } } @@ -395,62 +422,12 @@ impl GroupValues for GroupValuesColumn { self.current_indices.clear(); self.current_indices.extend(0..num_rows); while self.current_indices.len() > 0 { - let mut next_group_idx = self.group_values[0].len() as u64; self.vectorized_append_row_indices.clear(); self.vectorized_compare_row_indices.clear(); self.vectorized_compare_group_indices.clear(); self.vectorized_compare_results.clear(); - for (row, &target_hash) in batch_hashes.iter().enumerate() { - let entry = self.map.get_mut(target_hash, |(exist_hash, _)| { - // Somewhat surprisingly, this closure can be called even if the - // hash doesn't match, so check the hash first with an integer - // comparison first avoid the more expensive comparison with - // group value. https://github.com/apache/datafusion/pull/11718 - target_hash == *exist_hash - }); - - let Some((_, bucket_ctx)) = entry else { - // 1.1 Bucket not found case - // Insert the `new bucket` build from the `group index` - // Mark this `new bucket` checking, and add it to `checking_buckets` - let current_group_idx = next_group_idx; - - // for hasher function, use precomputed hash value - let mut bucket_ctx = BucketContext(current_group_idx); - bucket_ctx.set_checking(); - let bucket = self.map.insert_accounted( - (target_hash, bucket_ctx), - |(hash, _)| *hash, - &mut self.map_size, - ); - self.checking_buckets.push(bucket); - - // Add row index to `vectorized_append_row_indices` - self.vectorized_append_row_indices.push(row); - - next_group_idx += 1; - continue; - }; - - // 1.2 bucket found - // Check if the `bucket` checking, if so add it to `remaining_indices`, - // and just process it in next round, otherwise we continue the process - if bucket_ctx.is_checking() { - self.remaining_indices.push(row); - continue; - } - // Mark `bucket` checking, and add it to `checking_buckets` - 
bucket_ctx.set_checking(); - // Add row index to `vectorized_compare_row_indices` - // Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` - let group_indices = - self.get_group_indices_from_list(bucket_ctx.group_index() as usize); - for group_index in group_indices { - self.vectorized_compare_row_indices.push(row); - self.vectorized_compare_group_indices.push(group_index); - } - } + // 2. Perform `vectorized compare` } Ok(()) } From 14f888136406e718ae86b8937d1b6316c321b3df Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 23:04:32 +0800 Subject: [PATCH 13/60] tmp --- .../src/aggregates/group_values/column.rs | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index a13094d26a54..4891c989e959 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -254,6 +254,21 @@ impl GroupValuesColumn { GroupIndicesIterator::new(start_group_index, &self.group_index_lists) } + /// Collect vectorized context by checking hash values of `cols` in `map` + /// + /// 1. If bucket not found + /// - Insert the `new bucket` build from the `group index` + /// and its hash value to `map` + /// - Mark this `new bucket` checking, and add it to `checking_buckets` + /// - Add row index to `vectorized_append_row_indices` + /// + /// 2. 
bucket found + /// - Check if the `bucket` checking, if so add it to `remaining_indices`, + /// and just process it in next round, otherwise we continue the process + /// - Mark `bucket` checking, and add it to `checking_buckets` + /// - Add row index to `vectorized_compare_row_indices` + /// - Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` + /// fn collect_vectorized_process_context(&mut self, batch_hashes: &[u64]) { let mut next_group_idx = self.group_values[0].len() as u64; for &row in self.current_indices.iter() { @@ -267,7 +282,7 @@ impl GroupValuesColumn { }); let Some((_, bucket_ctx)) = entry else { - // 1.1 Bucket not found case + // 1. Bucket not found case // Insert the `new bucket` build from the `group index` // Mark this `new bucket` checking, and add it to `checking_buckets` let current_group_idx = next_group_idx; @@ -289,7 +304,7 @@ impl GroupValuesColumn { continue; }; - // 1.2 bucket found + // 2. bucket found // Check if the `bucket` checking, if so add it to `remaining_indices`, // and just process it in next round, otherwise we continue the process if bucket_ctx.is_checking() { @@ -305,7 +320,8 @@ impl GroupValuesColumn { while next_group_index > 0 { let current_group_index = next_group_index; self.vectorized_compare_row_indices.push(row); - self.vectorized_compare_group_indices.push(current_group_index - 1); + self.vectorized_compare_group_indices + .push(current_group_index - 1); next_group_index = self.group_index_lists[current_group_index]; } } @@ -392,32 +408,17 @@ impl GroupValues for GroupValuesColumn { // tracks to which group each of the input rows belongs groups.clear(); - // General steps for one round `vectorized compare & append`: - // 1. Calculate and check hash values of `cols` in `map` - // 2. Perform `vectorized compare` - // 3. Perform `vectorized append` - // 4. Reset the checking flag in `BucketContext` - - // 1. 
Calculate and check hash values of `cols` in `map` - // - // 1.1 If bucket not found - // - Insert the `new bucket` build from the `group index` - // and its hash value to `map` - // - Mark this `new bucket` checking, and add it to `checking_buckets` - // - Add row index to `vectorized_append_row_indices` - // - // 1.2 bucket found - // - Check if the `bucket` checking, if so add it to `remaining_indices`, - // and just process it in next round, otherwise we continue the process - // - Mark `bucket` checking, and add it to `checking_buckets` - // - Add row index to `vectorized_compare_row_indices` - // - Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` - // let batch_hashes = &mut self.hashes_buffer; batch_hashes.clear(); batch_hashes.resize(n_rows, 0); create_hashes(cols, &self.random_state, batch_hashes)?; + // General steps for one round `vectorized compare & append`: + // 1. Collect vectorized context by checking hash values of `cols` in `map` + // 2. Perform `vectorized compare` + // 3. Perform `vectorized append` + // 4. Reset the checking flag in `BucketContext` + let num_rows = cols[0].len(); self.current_indices.clear(); self.current_indices.extend(0..num_rows); @@ -427,6 +428,9 @@ impl GroupValues for GroupValuesColumn { self.vectorized_compare_group_indices.clear(); self.vectorized_compare_results.clear(); + // 1. Collect vectorized context by checking hash values of `cols` in `map` + self.collect_vectorized_process_context(batch_hashes); + // 2. Perform `vectorized compare` } Ok(()) From ebbeb5a4ed83b4778f94798df5395293d7430b41 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 23:22:32 +0800 Subject: [PATCH 14/60] define and impl `vectorized_compare`. 
--- .../src/aggregates/group_values/column.rs | 4 + .../aggregates/group_values/group_column.rs | 140 +++++++++++++----- 2 files changed, 111 insertions(+), 33 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 4891c989e959..d7d3196e8c95 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -326,6 +326,10 @@ impl GroupValuesColumn { } } } + + fn vectorized_compare(&mut self) { + + } } /// instantiates a [`PrimitiveGroupValueBuilder`] and pushes it into $v diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 95690fce596d..83ec39c6d8f8 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -62,7 +62,15 @@ pub trait GroupColumn: Send + Sync { /// Appends the row at `row` in `array` to this builder fn append_val(&mut self, array: &ArrayRef, row: usize); - fn append_batch(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool); + fn vectorized_compare( + &mut self, + group_indices: &[usize], + array: &ArrayRef, + rows: &[usize], + compare_results: &mut [bool], + ); + + fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool); /// Returns the number of rows stored in this builder fn len(&self) -> usize; @@ -119,7 +127,58 @@ impl GroupColumn self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) } - fn append_batch(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool) { + fn append_val(&mut self, array: &ArrayRef, row: usize) { + // Perf: skip null check if input can't have nulls + if NULLABLE { + if array.is_null(row) { + self.nulls.append(true); + self.group_values.push(T::default_value()); + } else { + 
self.nulls.append(false); + self.group_values.push(array.as_primitive::().value(row)); + } + } else { + self.group_values.push(array.as_primitive::().value(row)); + } + } + + fn vectorized_compare( + &mut self, + group_indices: &[usize], + array: &ArrayRef, + rows: &[usize], + compare_results: &mut [bool], + ) { + let array = array.as_primitive::(); + + for (idx, &lhs_row) in group_indices.iter().enumerate() { + // Has found not equal to, don't need to check + if !compare_results[idx] { + continue; + } + + let rhs_row = rows[idx]; + // Perf: skip null check (by short circuit) if input is not nullable + if NULLABLE { + let exist_null = self.nulls.is_null(lhs_row); + let input_null = array.is_null(rhs_row); + if let Some(result) = nulls_equal_to(exist_null, input_null) { + compare_results[idx] = result; + continue; + } + // Otherwise, we need to check their values + } + + compare_results[idx] = self.group_values[lhs_row] == array.value(rhs_row); + } + } + + fn vectorized_append( + &mut self, + array: &ArrayRef, + rows: &[usize], + all_non_null: bool, + ) { let arr = array.as_primitive::(); match (NULLABLE, all_non_null) { (true, true) => { @@ -149,21 +208,6 @@ impl GroupColumn } } - fn append_val(&mut self, array: &ArrayRef, row: usize) { - // Perf: skip null check if input can't have nulls - if NULLABLE { - if array.is_null(row) { - self.nulls.append(true); - self.group_values.push(T::default_value()); - } else { - self.nulls.append(false); - self.group_values.push(array.as_primitive::().value(row)); - } - } else { - self.group_values.push(array.as_primitive::().value(row)); - } - } - fn len(&self) -> usize { self.group_values.len() } @@ -339,50 +383,65 @@ where } } - fn append_batch(&mut self, column: &ArrayRef, rows: &[usize], all_non_null: bool) { + fn append_val(&mut self, column: &ArrayRef, row: usize) { + // Sanity array type match self.output_type { OutputType::Binary => { debug_assert!(matches!( column.data_type(), DataType::Binary | DataType::LargeBinary 
)); - self.append_batch_inner::>( - column, - rows, - all_non_null, - ) + self.append_val_inner::>(column, row) } OutputType::Utf8 => { debug_assert!(matches!( column.data_type(), DataType::Utf8 | DataType::LargeUtf8 )); - self.append_batch_inner::>( - column, - rows, - all_non_null, - ) + self.append_val_inner::>(column, row) } _ => unreachable!("View types should use `ArrowBytesViewMap`"), }; } - fn append_val(&mut self, column: &ArrayRef, row: usize) { - // Sanity array type + fn vectorized_compare( + &mut self, + group_indices: &[usize], + array: &ArrayRef, + rows: &[usize], + compare_results: &mut [bool], + ) { + todo!() + } + + fn vectorized_append( + &mut self, + column: &ArrayRef, + rows: &[usize], + all_non_null: bool, + ) { match self.output_type { OutputType::Binary => { debug_assert!(matches!( column.data_type(), DataType::Binary | DataType::LargeBinary )); - self.append_val_inner::>(column, row) + self.append_batch_inner::>( + column, + rows, + all_non_null, + ) } OutputType::Utf8 => { debug_assert!(matches!( column.data_type(), DataType::Utf8 | DataType::LargeUtf8 )); - self.append_val_inner::>(column, row) + self.append_batch_inner::>( + column, + rows, + all_non_null, + ) } _ => unreachable!("View types should use `ArrowBytesViewMap`"), }; @@ -909,7 +968,22 @@ impl GroupColumn for ByteViewGroupValueBuilder { self.append_val_inner(array, row) } - fn append_batch(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool) { + fn vectorized_compare( + &mut self, + group_indices: &[usize], + array: &ArrayRef, + rows: &[usize], + compare_results: &mut [bool], + ) { + todo!() + } + + fn vectorized_append( + &mut self, + array: &ArrayRef, + rows: &[usize], + all_non_null: bool, + ) { self.append_batch_inner(array, rows, all_non_null); } From dad79c0010232b0db1026a839159556f6e247893 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 23:28:58 +0800 Subject: [PATCH 15/60] fix compile. 
--- .../src/aggregates/group_values/column.rs | 21 ++++++++----------- .../aggregates/group_values/group_column.rs | 16 ++++++++++---- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index d7d3196e8c95..87bcd8e10ff5 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::mem; + use crate::aggregates::group_values::group_column::{ ByteGroupValueBuilder, ByteViewGroupValueBuilder, GroupColumn, PrimitiveGroupValueBuilder, @@ -246,14 +248,6 @@ impl GroupValuesColumn { ) } - #[inline] - fn get_group_indices_from_list( - &self, - start_group_index: usize, - ) -> GroupIndicesIterator { - GroupIndicesIterator::new(start_group_index, &self.group_index_lists) - } - /// Collect vectorized context by checking hash values of `cols` in `map` /// /// 1. If bucket not found @@ -328,7 +322,7 @@ impl GroupValuesColumn { } fn vectorized_compare(&mut self) { - + } } @@ -412,10 +406,10 @@ impl GroupValues for GroupValuesColumn { // tracks to which group each of the input rows belongs groups.clear(); - let batch_hashes = &mut self.hashes_buffer; + let mut batch_hashes = mem::take(&mut self.hashes_buffer); batch_hashes.clear(); batch_hashes.resize(n_rows, 0); - create_hashes(cols, &self.random_state, batch_hashes)?; + create_hashes(cols, &self.random_state, &mut batch_hashes)?; // General steps for one round `vectorized compare & append`: // 1. Collect vectorized context by checking hash values of `cols` in `map` @@ -433,10 +427,13 @@ impl GroupValues for GroupValuesColumn { self.vectorized_compare_results.clear(); // 1. 
Collect vectorized context by checking hash values of `cols` in `map` - self.collect_vectorized_process_context(batch_hashes); + self.collect_vectorized_process_context(&batch_hashes); // 2. Perform `vectorized compare` } + + self.hashes_buffer = batch_hashes; + Ok(()) } diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 83ec39c6d8f8..89cfea3b4de6 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -617,14 +617,14 @@ impl ByteViewGroupValueBuilder { if all_non_null { self.nulls.append_n(rows.len(), false); for &row in rows { - self.append_value(arr, row); + self.do_append_val_inner(arr, row); } } else { for &row in rows { // Null row case, set and return if arr.is_valid(row) { self.nulls.append(false); - self.append_value(arr, row); + self.do_append_val_inner(arr, row); } else { self.nulls.append(true); self.views.push(0); @@ -645,10 +645,10 @@ impl ByteViewGroupValueBuilder { // Not null row case self.nulls.append(false); - self.append_value(arr, row); + self.do_append_val_inner(arr, row); } - fn append_value(&mut self, array: &GenericByteViewArray, row: usize) + fn do_append_val_inner(&mut self, array: &GenericByteViewArray, row: usize) where B: ByteViewType, { @@ -690,7 +690,15 @@ impl ByteViewGroupValueBuilder { fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { let array = array.as_byte_view::(); + self.do_equal_to_inner(lhs_row, array, rhs_row) + } + fn do_equal_to_inner( + &self, + lhs_row: usize, + array: &GenericByteViewArray, + rhs_row: usize, + ) -> bool { // Check if nulls equal firstly let exist_null = self.nulls.is_null(lhs_row); let input_null = array.is_null(rhs_row); From 1a7c2ebc5502c4b004c3713df8e2bdbccc69e1e3 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 22 Oct 2024 23:45:18 +0800 Subject: [PATCH 
16/60] impl `vectorized_equal_to`. --- .../src/aggregates/group_values/column.rs | 67 ++++++++++++------- .../aggregates/group_values/group_column.rs | 30 ++++----- 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 87bcd8e10ff5..2045cbc9b790 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -62,7 +62,7 @@ const UNSET_CHECKING_FLAG_MASK: u64 = 0x7FFFFFFFFFFFFFFF; /// ### Checking flag /// /// It is possible that rows with same hash values exist in `input cols`. -/// And if we `vectorized compare` and `vectorized append` them +/// And if we `vectorized_equal_to` and `vectorized append` them /// in the same round, some fault cases will occur especially when /// they are totally the repeated rows... /// @@ -71,7 +71,7 @@ const UNSET_CHECKING_FLAG_MASK: u64 = 0x7FFFFFFFFFFFFFFF; /// /// - We found their hash values equal to one exist group /// -/// - We then perform `vectorized compare` for them to the exist group, +/// - We then perform `vectorized_equal_to` for them to the exist group, /// and found their values not equal to the exist one /// /// - Finally when perform `vectorized append`, we decide to build two @@ -153,14 +153,14 @@ pub struct GroupValuesColumn { /// is used to store the rows will be processed in next round. 
remaining_indices: Vec, - /// The `vectorized compared` row indices buffer - vectorized_compare_row_indices: Vec, + /// The `vectorized_equal_tod` row indices buffer + vectorized_equal_to_row_indices: Vec, - /// The `vectorized compared` group indices buffer - vectorized_compare_group_indices: Vec, + /// The `vectorized_equal_tod` group indices buffer + vectorized_equal_to_group_indices: Vec, - /// The `vectorized compared` result buffer - vectorized_compare_results: Vec, + /// The `vectorized_equal_tod` result buffer + vectorized_equal_to_results: Vec, /// The `vectorized append` row indices buffer vectorized_append_row_indices: Vec, @@ -204,9 +204,9 @@ impl GroupValuesColumn { append_rows_buffer: Default::default(), current_indices: Default::default(), remaining_indices: Default::default(), - vectorized_compare_row_indices: Default::default(), - vectorized_compare_group_indices: Default::default(), - vectorized_compare_results: Default::default(), + vectorized_equal_to_row_indices: Default::default(), + vectorized_equal_to_group_indices: Default::default(), + vectorized_equal_to_results: Default::default(), vectorized_append_row_indices: Default::default(), }) } @@ -260,8 +260,8 @@ impl GroupValuesColumn { /// - Check if the `bucket` checking, if so add it to `remaining_indices`, /// and just process it in next round, otherwise we continue the process /// - Mark `bucket` checking, and add it to `checking_buckets` - /// - Add row index to `vectorized_compare_row_indices` - /// - Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` + /// - Add row index to `vectorized_equal_to_row_indices` + /// - Add group indices(from `group_index_lists`) to `vectorized_equal_to_group_indices` /// fn collect_vectorized_process_context(&mut self, batch_hashes: &[u64]) { let mut next_group_idx = self.group_values[0].len() as u64; @@ -308,21 +308,36 @@ impl GroupValuesColumn { // Mark `bucket` checking, and add it to `checking_buckets` 
bucket_ctx.set_checking(); - // Add row index to `vectorized_compare_row_indices` - // Add group indices(from `group_index_lists`) to `vectorized_compare_group_indices` + // Add row index to `vectorized_equal_to_row_indices` + // Add group indices(from `group_index_lists`) to `vectorized_equal_to_group_indices` let mut next_group_index = bucket_ctx.group_index() as usize + 1; while next_group_index > 0 { let current_group_index = next_group_index; - self.vectorized_compare_row_indices.push(row); - self.vectorized_compare_group_indices + self.vectorized_equal_to_row_indices.push(row); + self.vectorized_equal_to_group_indices .push(current_group_index - 1); next_group_index = self.group_index_lists[current_group_index]; } } - } - fn vectorized_compare(&mut self) { + self.vectorized_equal_to_results + .resize(self.vectorized_equal_to_group_indices.len(), true); + } + /// Perform `vectorized_equal_to` + /// + /// + fn vectorized_equal_to(&mut self, cols: &[ArrayRef]) { + let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results); + for (col_idx, group_col) in self.group_values.iter().enumerate() { + group_col.vectorized_equal_to( + &self.vectorized_equal_to_group_indices, + &cols[col_idx], + &self.vectorized_equal_to_row_indices, + &mut equal_to_results, + ); + } + self.vectorized_equal_to_results = equal_to_results; } } @@ -411,10 +426,10 @@ impl GroupValues for GroupValuesColumn { batch_hashes.resize(n_rows, 0); create_hashes(cols, &self.random_state, &mut batch_hashes)?; - // General steps for one round `vectorized compare & append`: + // General steps for one round `vectorized equal_to & append`: // 1. Collect vectorized context by checking hash values of `cols` in `map` - // 2. Perform `vectorized compare` - // 3. Perform `vectorized append` + // 2. Perform `vectorized_equal_to` + // 3. Perform `vectorized_append` // 4. 
Reset the checking flag in `BucketContext` let num_rows = cols[0].len(); @@ -422,14 +437,14 @@ impl GroupValues for GroupValuesColumn { self.current_indices.extend(0..num_rows); while self.current_indices.len() > 0 { self.vectorized_append_row_indices.clear(); - self.vectorized_compare_row_indices.clear(); - self.vectorized_compare_group_indices.clear(); - self.vectorized_compare_results.clear(); + self.vectorized_equal_to_row_indices.clear(); + self.vectorized_equal_to_group_indices.clear(); + self.vectorized_equal_to_results.clear(); // 1. Collect vectorized context by checking hash values of `cols` in `map` self.collect_vectorized_process_context(&batch_hashes); - // 2. Perform `vectorized compare` + // 2. Perform `vectorized_equal_to` } self.hashes_buffer = batch_hashes; diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 89cfea3b4de6..7db5bd6d6d3a 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -62,12 +62,12 @@ pub trait GroupColumn: Send + Sync { /// Appends the row at `row` in `array` to this builder fn append_val(&mut self, array: &ArrayRef, row: usize); - fn vectorized_compare( - &mut self, + fn vectorized_equal_to( + &self, group_indices: &[usize], array: &ArrayRef, rows: &[usize], - compare_results: &mut [bool], + equal_to_results: &mut [bool], ); fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool); @@ -142,18 +142,18 @@ impl GroupColumn } } - fn vectorized_compare( - &mut self, + fn vectorized_equal_to( + &self, group_indices: &[usize], array: &ArrayRef, rows: &[usize], - compare_results: &mut [bool], + equal_to_results: &mut [bool], ) { let array = array.as_primitive::(); for (idx, &lhs_row) in group_indices.iter().enumerate() { // Has found not equal to, don't need to check - if !compare_results[idx] { 
+ if !equal_to_results[idx] { continue; } @@ -163,13 +163,13 @@ impl GroupColumn let exist_null = self.nulls.is_null(lhs_row); let input_null = array.is_null(rhs_row); if let Some(result) = nulls_equal_to(exist_null, input_null) { - compare_results[idx] = result; + equal_to_results[idx] = result; continue; } // Otherwise, we need to check their values } - compare_results[idx] = self.group_values[lhs_row] == array.value(rhs_row); + equal_to_results[idx] = self.group_values[lhs_row] == array.value(rhs_row); } } @@ -404,12 +404,12 @@ where }; } - fn vectorized_compare( - &mut self, + fn vectorized_equal_to( + &self, group_indices: &[usize], array: &ArrayRef, rows: &[usize], - compare_results: &mut [bool], + equal_to_results: &mut [bool], ) { todo!() } @@ -976,12 +976,12 @@ impl GroupColumn for ByteViewGroupValueBuilder { self.append_val_inner(array, row) } - fn vectorized_compare( - &mut self, + fn vectorized_equal_to( + &self, group_indices: &[usize], array: &ArrayRef, rows: &[usize], - compare_results: &mut [bool], + equal_to_results: &mut [bool], ) { todo!() } From d79b813b90d17cc087312cca9ff12aa51e145eb7 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 23 Oct 2024 14:19:12 +0800 Subject: [PATCH 17/60] impl `vectorized_append`. 
--- .../src/aggregates/group_values/column.rs | 47 ++- .../aggregates/group_values/group_column.rs | 308 ++++++++++++------ 2 files changed, 247 insertions(+), 108 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 2045cbc9b790..ced09d26e6a8 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -319,16 +319,23 @@ impl GroupValuesColumn { next_group_index = self.group_index_lists[current_group_index]; } } - - self.vectorized_equal_to_results - .resize(self.vectorized_equal_to_group_indices.len(), true); } /// Perform `vectorized_equal_to` /// - /// fn vectorized_equal_to(&mut self, cols: &[ArrayRef]) { + debug_assert_eq!( + self.vectorized_equal_to_group_indices.len(), + self.vectorized_equal_to_row_indices.len() + ); + + if self.vectorized_equal_to_group_indices.is_empty() { + return; + } + + // Vectorized equal to `cols` and `group columns` let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results); + equal_to_results.resize(self.vectorized_equal_to_group_indices.len(), true); for (col_idx, group_col) in self.group_values.iter().enumerate() { group_col.vectorized_equal_to( &self.vectorized_equal_to_group_indices, @@ -337,8 +344,40 @@ impl GroupValuesColumn { &mut equal_to_results, ); } + + let mut current_row_equal_to_result = false; + let mut current_row = *self.vectorized_equal_to_row_indices.first().unwrap(); + for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() { + // If found next row, according to the equal to result of `current_row` + if current_row != row { + if !current_row_equal_to_result { + self.vectorized_append_row_indices.push(row); + } + current_row = row; + current_row_equal_to_result = equal_to_results[idx]; + continue; + } + current_row_equal_to_result |= equal_to_results[idx]; + } + + if 
!current_row_equal_to_result { + self.vectorized_append_row_indices.push(current_row); + } + self.vectorized_equal_to_results = equal_to_results; } + + /// Perform `vectorized_append` + /// + /// 1. Vectorized append new values into `group_values` + /// 2. Update `map` and `group_index_lists` + fn vectorized_append(&mut self, cols: &[ArrayRef], batch_hashes: &[u64]) { + if self.vectorized_append_row_indices.is_empty() { + return; + } + + // 1. Vectorized append new values into `group_values` + } } /// instantiates a [`PrimitiveGroupValueBuilder`] and pushes it into $v diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 7db5bd6d6d3a..22cbe70e90ca 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -70,7 +70,7 @@ pub trait GroupColumn: Send + Sync { equal_to_results: &mut [bool], ); - fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize], all_non_null: bool); + fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]); /// Returns the number of rows stored in this builder fn len(&self) -> usize; @@ -173,22 +173,21 @@ impl GroupColumn } } - fn vectorized_append( - &mut self, - array: &ArrayRef, - rows: &[usize], - all_non_null: bool, - ) { + fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) { let arr = array.as_primitive::(); - match (NULLABLE, all_non_null) { - (true, true) => { - self.nulls.append_n(rows.len(), false); - self.group_values.reserve(rows.len()); - for &row in rows { - self.group_values.push(arr.value(row)); - } - } - (true, false) => { + + let null_count = array.null_count(); + let num_rows = array.len(); + let all_null_or_non_null = if null_count == 0 { + Some(true) + } else if null_count == num_rows { + Some(false) + } else { + None + }; + + match (NULLABLE, all_null_or_non_null) { + (true, None) => { for 
&row in rows { if array.is_null(row) { self.nulls.append(true); @@ -199,6 +198,19 @@ impl GroupColumn } } } + + (true, Some(true)) => { + self.nulls.append_n(rows.len(), false); + self.group_values.reserve(rows.len()); + for &row in rows { + self.group_values.push(arr.value(row)); + } + } + + (true, Some(false)) => { + self.nulls.append_n(rows.len(), true); + } + (false, _) => { self.group_values.reserve(rows.len()); for &row in rows { @@ -280,34 +292,12 @@ where } } - fn append_batch_inner( - &mut self, - array: &ArrayRef, - rows: &[usize], - all_non_null: bool, - ) where + fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool + where B: ByteArrayType, { - let arr = array.as_bytes::(); - - if all_non_null { - self.nulls.append_n(rows.len(), false); - for &row in rows { - self.append_value(arr, row); - } - } else { - for &row in rows { - if arr.is_null(row) { - self.nulls.append(true); - // nulls need a zero length in the offset buffer - let offset = self.buffer.len(); - self.offsets.push(O::usize_as(offset)); - } else { - self.nulls.append(false); - self.append_value(arr, row); - } - } - } + let array = array.as_bytes::(); + self.do_equal_to_inner(lhs_row, array, rhs_row) } fn append_val_inner(&mut self, array: &ArrayRef, row: usize) @@ -322,24 +312,84 @@ where self.offsets.push(O::usize_as(offset)); } else { self.nulls.append(false); - self.append_value(arr, row); + self.do_append_val_inner(arr, row); } } - fn append_value(&mut self, array: &GenericByteArray, row: usize) + fn vectorized_equal_to_inner( + &self, + group_indices: &[usize], + array: &ArrayRef, + rows: &[usize], + equal_to_results: &mut [bool], + ) where + B: ByteArrayType, + { + let array = array.as_bytes::(); + + for (idx, &lhs_row) in group_indices.iter().enumerate() { + // Has found not equal to, don't need to check + if !equal_to_results[idx] { + continue; + } + + let rhs_row = rows[idx]; + equal_to_results[idx] = self.do_equal_to_inner(lhs_row, array, rhs_row); + } 
+ } + + fn vectorized_append_inner(&mut self, array: &ArrayRef, rows: &[usize]) where B: ByteArrayType, { - let value: &[u8] = array.value(row).as_ref(); - self.buffer.append_slice(value); - self.offsets.push(O::usize_as(self.buffer.len())); + let arr = array.as_bytes::(); + let null_count = array.null_count(); + let num_rows = array.len(); + let all_null_or_non_null = if null_count == 0 { + Some(true) + } else if null_count == num_rows { + Some(false) + } else { + None + }; + + match all_null_or_non_null { + None => { + for &row in rows { + if arr.is_null(row) { + self.nulls.append(true); + // nulls need a zero length in the offset buffer + let offset = self.buffer.len(); + self.offsets.push(O::usize_as(offset)); + } else { + self.nulls.append(false); + self.do_append_val_inner(arr, row); + } + } + } + + Some(true) => { + self.nulls.append_n(rows.len(), false); + for &row in rows { + self.do_append_val_inner(arr, row); + } + } + + Some(false) => { + self.nulls.append_n(rows.len(), true); + } + } } - fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool + fn do_equal_to_inner( + &self, + lhs_row: usize, + array: &GenericByteArray, + rhs_row: usize, + ) -> bool where B: ByteArrayType, { - let array = array.as_bytes::(); let exist_null = self.nulls.is_null(lhs_row); let input_null = array.is_null(rhs_row); if let Some(result) = nulls_equal_to(exist_null, input_null) { @@ -349,6 +399,15 @@ where self.value(lhs_row) == (array.value(rhs_row).as_ref() as &[u8]) } + fn do_append_val_inner(&mut self, array: &GenericByteArray, row: usize) + where + B: ByteArrayType, + { + let value: &[u8] = array.value(row).as_ref(); + self.buffer.append_slice(value); + self.offsets.push(O::usize_as(self.buffer.len())); + } + /// return the current value of the specified row irrespective of null pub fn value(&self, row: usize) -> &[u8] { let l = self.offsets[row].as_usize(); @@ -411,37 +470,51 @@ where rows: &[usize], equal_to_results: &mut [bool], ) { - todo!() 
+ // Sanity array type + match self.output_type { + OutputType::Binary => { + debug_assert!(matches!( + array.data_type(), + DataType::Binary | DataType::LargeBinary + )); + self.vectorized_equal_to_inner::>( + group_indices, + array, + rows, + equal_to_results, + ); + } + OutputType::Utf8 => { + debug_assert!(matches!( + array.data_type(), + DataType::Utf8 | DataType::LargeUtf8 + )); + self.vectorized_equal_to_inner::>( + group_indices, + array, + rows, + equal_to_results, + ); + } + _ => unreachable!("View types should use `ArrowBytesViewMap`"), + } } - fn vectorized_append( - &mut self, - column: &ArrayRef, - rows: &[usize], - all_non_null: bool, - ) { + fn vectorized_append(&mut self, column: &ArrayRef, rows: &[usize]) { match self.output_type { OutputType::Binary => { debug_assert!(matches!( column.data_type(), DataType::Binary | DataType::LargeBinary )); - self.append_batch_inner::>( - column, - rows, - all_non_null, - ) + self.vectorized_append_inner::>(column, rows) } OutputType::Utf8 => { debug_assert!(matches!( column.data_type(), DataType::Utf8 | DataType::LargeUtf8 )); - self.append_batch_inner::>( - column, - rows, - all_non_null, - ) + self.vectorized_append_inner::>(column, rows) } _ => unreachable!("View types should use `ArrowBytesViewMap`"), }; @@ -606,31 +679,9 @@ impl ByteViewGroupValueBuilder { self } - fn append_batch_inner( - &mut self, - array: &ArrayRef, - rows: &[usize], - all_non_null: bool, - ) { - let arr = array.as_byte_view::(); - - if all_non_null { - self.nulls.append_n(rows.len(), false); - for &row in rows { - self.do_append_val_inner(arr, row); - } - } else { - for &row in rows { - // Null row case, set and return - if arr.is_valid(row) { - self.nulls.append(false); - self.do_append_val_inner(arr, row); - } else { - self.nulls.append(true); - self.views.push(0); - } - } - } + fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { + let array = array.as_byte_view::(); + self.do_equal_to_inner(lhs_row, 
array, rhs_row) } fn append_val_inner(&mut self, array: &ArrayRef, row: usize) { @@ -648,6 +699,65 @@ impl ByteViewGroupValueBuilder { self.do_append_val_inner(arr, row); } + fn vectorized_equal_to_inner( + &self, + group_indices: &[usize], + array: &ArrayRef, + rows: &[usize], + equal_to_results: &mut [bool], + ) { + let array = array.as_byte_view::(); + + for (idx, &lhs_row) in group_indices.iter().enumerate() { + // Has found not equal to, don't need to check + if !equal_to_results[idx] { + continue; + } + + let rhs_row = rows[idx]; + equal_to_results[idx] = self.do_equal_to_inner(lhs_row, array, rhs_row); + } + } + + fn vectorized_append_inner(&mut self, array: &ArrayRef, rows: &[usize]) { + let arr = array.as_byte_view::(); + let null_count = array.null_count(); + let num_rows = array.len(); + let all_null_or_non_null = if null_count == 0 { + Some(true) + } else if null_count == num_rows { + Some(false) + } else { + None + }; + + match all_null_or_non_null { + None => { + for &row in rows { + // Null row case, set and return + if arr.is_valid(row) { + self.nulls.append(false); + self.do_append_val_inner(arr, row); + } else { + self.nulls.append(true); + self.views.push(0); + } + } + } + + Some(true) => { + self.nulls.append_n(rows.len(), false); + for &row in rows { + self.do_append_val_inner(arr, row); + } + } + + Some(false) => { + self.nulls.append_n(rows.len(), true); + } + } + } + fn do_append_val_inner(&mut self, array: &GenericByteViewArray, row: usize) where B: ByteViewType, @@ -688,11 +798,6 @@ impl ByteViewGroupValueBuilder { } } - fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { - let array = array.as_byte_view::(); - self.do_equal_to_inner(lhs_row, array, rhs_row) - } - fn do_equal_to_inner( &self, lhs_row: usize, @@ -983,16 +1088,11 @@ impl GroupColumn for ByteViewGroupValueBuilder { rows: &[usize], equal_to_results: &mut [bool], ) { - todo!() + self.vectorized_equal_to_inner(group_indices, array, rows, 
equal_to_results); } - fn vectorized_append( - &mut self, - array: &ArrayRef, - rows: &[usize], - all_non_null: bool, - ) { - self.append_batch_inner(array, rows, all_non_null); + fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]) { + self.vectorized_append_inner(array, rows); } fn len(&self) -> usize { From 6edc64665b1b61ca29bb5dec901867b5de779f1f Mon Sep 17 00:00:00 2001 From: kamille Date: Fri, 25 Oct 2024 18:14:44 +0800 Subject: [PATCH 18/60] finish the basic vectorized ops logic. --- .../src/aggregates/group_values/column.rs | 259 +++++++++--------- 1 file changed, 134 insertions(+), 125 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index ced09d26e6a8..d25e00797e1e 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::mem; +use std::{iter, mem}; use crate::aggregates::group_values::group_column::{ ByteGroupValueBuilder, ByteViewGroupValueBuilder, GroupColumn, @@ -140,10 +140,7 @@ pub struct GroupValuesColumn { /// `latest group index -> older group index -> even older group index -> ...` group_index_lists: Vec, - /// The marked checking buckets in this round - /// - /// About the checking flag you can see [`BucketContext`] - checking_buckets: Vec>, + index_lists_updates: Vec<(usize, usize)>, /// We need multiple rounds to process the `input cols`, /// and the rows processing in current round is stored here. @@ -153,6 +150,16 @@ pub struct GroupValuesColumn { /// is used to store the rows will be processed in next round. 
remaining_indices: Vec, + /// The marked checking buckets in this round + /// + /// About the checking flag you can see [`BucketContext`] + empty_buckets: Vec>, + + /// The marked checking buckets in this round + /// + /// About the checking flag you can see [`BucketContext`] + occupied_buckets: Vec>, + /// The `vectorized_equal_tod` row indices buffer vectorized_equal_to_row_indices: Vec, @@ -175,15 +182,13 @@ pub struct GroupValuesColumn { /// [`GroupValuesRows`]: crate::aggregates::group_values::row::GroupValuesRows group_values: Vec>, + group_values_len: usize, + /// reused buffer to store hashes hashes_buffer: Vec, /// Random state for creating hashes random_state: RandomState, - - column_nullables_buffer: Vec, - - append_rows_buffer: Vec, } impl GroupValuesColumn { @@ -195,15 +200,16 @@ impl GroupValuesColumn { schema, map, group_index_lists: Vec::new(), - checking_buckets: Default::default(), + index_lists_updates: Vec::new(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), random_state: Default::default(), - column_nullables_buffer: vec![false; num_cols], - append_rows_buffer: Default::default(), current_indices: Default::default(), remaining_indices: Default::default(), + group_values_len: 0, + empty_buckets: Default::default(), + occupied_buckets: Default::default(), vectorized_equal_to_row_indices: Default::default(), vectorized_equal_to_group_indices: Default::default(), vectorized_equal_to_results: Default::default(), @@ -264,10 +270,16 @@ impl GroupValuesColumn { /// - Add group indices(from `group_index_lists`) to `vectorized_equal_to_group_indices` /// fn collect_vectorized_process_context(&mut self, batch_hashes: &[u64]) { - let mut next_group_idx = self.group_values[0].len() as u64; + self.vectorized_append_row_indices.clear(); + self.vectorized_equal_to_row_indices.clear(); + self.vectorized_equal_to_group_indices.clear(); + self.empty_buckets.clear(); + self.occupied_buckets.clear(); + self.group_values_len = 
self.group_values[0].len(); + for &row in self.current_indices.iter() { let target_hash = batch_hashes[row]; - let entry = self.map.get_mut(target_hash, |(exist_hash, _)| { + let entry = self.map.find(target_hash, |(exist_hash, _)| { // Somewhat surprisingly, this closure can be called even if the // hash doesn't match, so check the hash first with an integer // comparison first avoid the more expensive comparison with @@ -275,11 +287,11 @@ impl GroupValuesColumn { target_hash == *exist_hash }); - let Some((_, bucket_ctx)) = entry else { + let Some(bucket) = entry else { // 1. Bucket not found case // Insert the `new bucket` build from the `group index` - // Mark this `new bucket` checking, and add it to `checking_buckets` - let current_group_idx = next_group_idx; + // Mark this `new bucket` checking + let current_group_idx = self.group_values_len as u64; // for hasher function, use precomputed hash value let mut bucket_ctx = BucketContext(current_group_idx); @@ -289,42 +301,55 @@ impl GroupValuesColumn { |(hash, _)| *hash, &mut self.map_size, ); - self.checking_buckets.push(bucket); + self.empty_buckets.push(bucket); // Add row index to `vectorized_append_row_indices` self.vectorized_append_row_indices.push(row); - next_group_idx += 1; + self.group_values_len += 1; continue; }; // 2. 
bucket found // Check if the `bucket` checking, if so add it to `remaining_indices`, // and just process it in next round, otherwise we continue the process - if bucket_ctx.is_checking() { - self.remaining_indices.push(row); - continue; - } - // Mark `bucket` checking, and add it to `checking_buckets` - bucket_ctx.set_checking(); + let mut list_next_group_idx = unsafe { + let (_, bucket_ctx) = bucket.as_mut(); + if bucket_ctx.is_checking() { + self.remaining_indices.push(row); + continue; + } + + // Mark `bucket` checking + bucket_ctx.set_checking(); + bucket_ctx.group_index() as usize + 1 + }; + + // Add it to `checking_buckets` // Add row index to `vectorized_equal_to_row_indices` // Add group indices(from `group_index_lists`) to `vectorized_equal_to_group_indices` - let mut next_group_index = bucket_ctx.group_index() as usize + 1; - while next_group_index > 0 { - let current_group_index = next_group_index; + while list_next_group_idx > 0 { + self.occupied_buckets.push(bucket.clone()); + + let list_group_idx = list_next_group_idx; self.vectorized_equal_to_row_indices.push(row); self.vectorized_equal_to_group_indices - .push(current_group_index - 1); - next_group_index = self.group_index_lists[current_group_index]; + .push(list_group_idx - 1); + list_next_group_idx = self.group_index_lists[list_group_idx]; } } + + // Reset empty bucket's checking flag + self.empty_buckets.iter().for_each(|bucket| unsafe { + let (_, bucket_ctx) = bucket.as_mut(); + bucket_ctx.unset_checking(); + }); } /// Perform `vectorized_equal_to` - /// fn vectorized_equal_to(&mut self, cols: &[ArrayRef]) { - debug_assert_eq!( + assert_eq!( self.vectorized_equal_to_group_indices.len(), self.vectorized_equal_to_row_indices.len() ); @@ -335,7 +360,9 @@ impl GroupValuesColumn { // Vectorized equal to `cols` and `group columns` let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results); + equal_to_results.clear(); 
equal_to_results.resize(self.vectorized_equal_to_group_indices.len(), true); + for (col_idx, group_col) in self.group_values.iter().enumerate() { group_col.vectorized_equal_to( &self.vectorized_equal_to_group_indices, @@ -345,38 +372,87 @@ impl GroupValuesColumn { ); } + self.vectorized_equal_to_results = equal_to_results; + } + + /// Perform `vectorized_append` + /// + /// 1. Check equal to results, if found a equal row, nothing to do; + /// otherwise, we should create a new group for the row: + /// + /// - Modify the related bucket stored in `checking_buckets`, + /// - Store updates for `group_index_lists` in `pending_index_lists_updates` + /// - Increase the `group_values_len` + /// + /// 2. Resize the `group_index_lists`, apply `pending_index_lists_updates` to it + /// + /// 3. Perform `vectorized_append` + /// + fn vectorized_append(&mut self, cols: &[ArrayRef]) { + let mut index_lists_updates = mem::take(&mut self.index_lists_updates); + index_lists_updates.clear(); + + // 1. Check equal to results, if found a equal row, nothing to do; + // otherwise, we should create a new group for the row. 
let mut current_row_equal_to_result = false; - let mut current_row = *self.vectorized_equal_to_row_indices.first().unwrap(); for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() { - // If found next row, according to the equal to result of `current_row` - if current_row != row { + current_row_equal_to_result |= self.vectorized_equal_to_results[idx]; + + // Look forward next one row and check + let next_row = self + .vectorized_append_row_indices + .get(idx + 1) + .unwrap_or(&usize::MAX); + if row != *next_row { + // If we should create a new group for the row + // Store update for `group_index_lists` + // Update related `BucketContext`(set the group index to latest) + // Increase the `group_values_len` if !current_row_equal_to_result { self.vectorized_append_row_indices.push(row); + unsafe { + let (_, bucket_ctx) = self.occupied_buckets[idx].as_mut(); + + index_lists_updates.push(( + self.group_values_len + 1, + (bucket_ctx.group_index() + 1) as usize, + )); + + *bucket_ctx = BucketContext(self.group_values_len as u64); + } + + self.group_values_len += 1; + } else { + unsafe { + let (_, bucket_ctx) = self.occupied_buckets[idx].as_mut(); + bucket_ctx.unset_checking(); + } } - current_row = row; - current_row_equal_to_result = equal_to_results[idx]; - continue; + + current_row_equal_to_result = false; } - current_row_equal_to_result |= equal_to_results[idx]; } - if !current_row_equal_to_result { - self.vectorized_append_row_indices.push(current_row); + // 2. Resize the `group_index_lists`, apply `pending_index_lists_updates` to it + self.group_index_lists.resize(self.group_values_len + 1, 0); + for &(latest_index, prev_index) in index_lists_updates.iter() { + self.group_index_lists[latest_index] = prev_index; } - self.vectorized_equal_to_results = equal_to_results; - } - - /// Perform `vectorized_append` - /// - /// 1. Vectorized append new values into `group_values` - /// 2. 
Update `map` and `group_index_lists` - fn vectorized_append(&mut self, cols: &[ArrayRef], batch_hashes: &[u64]) { + // 3. Perform `vectorized_append` if self.vectorized_append_row_indices.is_empty() { return; } - // 1. Vectorized append new values into `group_values` + let iter = self.group_values.iter_mut().zip(cols.iter()); + for (group_column, col) in iter { + group_column.vectorized_append(col, &self.vectorized_append_row_indices); + } + + assert_eq!(self.group_values[0].len(), self.group_values_len); + + // Set back `index_lists_updates`. + self.index_lists_updates = index_lists_updates; } } @@ -469,21 +545,22 @@ impl GroupValues for GroupValuesColumn { // 1. Collect vectorized context by checking hash values of `cols` in `map` // 2. Perform `vectorized_equal_to` // 3. Perform `vectorized_append` - // 4. Reset the checking flag in `BucketContext` - + // 4. Update `current_indices` let num_rows = cols[0].len(); self.current_indices.clear(); self.current_indices.extend(0..num_rows); while self.current_indices.len() > 0 { - self.vectorized_append_row_indices.clear(); - self.vectorized_equal_to_row_indices.clear(); - self.vectorized_equal_to_group_indices.clear(); - self.vectorized_equal_to_results.clear(); - // 1. Collect vectorized context by checking hash values of `cols` in `map` self.collect_vectorized_process_context(&batch_hashes); // 2. Perform `vectorized_equal_to` + self.vectorized_equal_to(cols); + + // 3. Perform `vectorized_append` + self.vectorized_append(cols); + + // 4. Update `current_indices` + mem::swap(&mut self.current_indices, &mut self.remaining_indices); } self.hashes_buffer = batch_hashes; @@ -570,71 +647,3 @@ impl GroupValues for GroupValuesColumn { self.hashes_buffer.shrink_to(count); } } - -fn is_rows_eq( - left_arrays: &[ArrayRef], - left: usize, - right_arrays: &[ArrayRef], - right: usize, -) -> Result { - let mut is_equal = true; - for (left_array, right_array) in left_arrays.iter().zip(right_arrays) { - macro_rules! 
compare_value { - ($T:ty) => {{ - match (left_array.is_null(left), right_array.is_null(right)) { - (false, false) => { - let left_array = - left_array.as_any().downcast_ref::<$T>().unwrap(); - let right_array = - right_array.as_any().downcast_ref::<$T>().unwrap(); - if left_array.value(left) != right_array.value(right) { - is_equal = false; - } - } - (true, false) => is_equal = false, - (false, true) => is_equal = false, - _ => {} - } - }}; - } - - match left_array.data_type() { - DataType::Null => {} - DataType::Boolean => compare_value!(BooleanArray), - DataType::Int8 => compare_value!(Int8Array), - DataType::Int16 => compare_value!(Int16Array), - DataType::Int32 => compare_value!(Int32Array), - DataType::Int64 => compare_value!(Int64Array), - DataType::UInt8 => compare_value!(UInt8Array), - DataType::UInt16 => compare_value!(UInt16Array), - DataType::UInt32 => compare_value!(UInt32Array), - DataType::UInt64 => compare_value!(UInt64Array), - DataType::Float32 => compare_value!(Float32Array), - DataType::Float64 => compare_value!(Float64Array), - DataType::Utf8 => compare_value!(StringArray), - DataType::LargeUtf8 => compare_value!(LargeStringArray), - DataType::Binary => compare_value!(BinaryArray), - DataType::Utf8View => compare_value!(StringViewArray), - DataType::BinaryView => compare_value!(BinaryViewArray), - DataType::Decimal128(..) 
=> compare_value!(Decimal128Array), - DataType::Timestamp(time_unit, None) => match time_unit { - TimeUnit::Second => compare_value!(TimestampSecondArray), - TimeUnit::Millisecond => compare_value!(TimestampMillisecondArray), - TimeUnit::Microsecond => compare_value!(TimestampMicrosecondArray), - TimeUnit::Nanosecond => compare_value!(TimestampNanosecondArray), - }, - DataType::Date32 => compare_value!(Date32Array), - DataType::Date64 => compare_value!(Date64Array), - dt => { - return not_impl_err!( - "Unsupported data type in sort merge join comparator: {}", - dt - ); - } - } - if !is_equal { - return Ok(false); - } - } - Ok(true) -} From 150248f1203bb231fdce9c7f69d15c5fc603c514 Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 26 Oct 2024 16:55:08 +0800 Subject: [PATCH 19/60] impl `take_n`. --- .../src/aggregates/group_values/column.rs | 75 +++++++++++++++---- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index d25e00797e1e..0b9d9176f421 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use std::{iter, mem}; +use std::ops::Sub; +use std::{iter, mem, usize}; use crate::aggregates::group_values::group_column::{ ByteGroupValueBuilder, ByteViewGroupValueBuilder, GroupColumn, @@ -388,15 +389,24 @@ impl GroupValuesColumn { /// /// 3. 
Perform `vectorized_append` /// - fn vectorized_append(&mut self, cols: &[ArrayRef]) { + fn vectorized_append(&mut self, cols: &[ArrayRef], groups: &mut Vec) { let mut index_lists_updates = mem::take(&mut self.index_lists_updates); index_lists_updates.clear(); + // Set the default value to usize::MAX, so when we made a mistake, + // panic will happen rather than + groups.resize(cols[0].len(), usize::MAX); // 1. Check equal to results, if found a equal row, nothing to do; // otherwise, we should create a new group for the row. let mut current_row_equal_to_result = false; + let mut current_match_group_index = None; for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() { - current_row_equal_to_result |= self.vectorized_equal_to_results[idx]; + let equal_to_result = self.vectorized_equal_to_results[idx]; + if equal_to_result { + current_match_group_index = + Some(self.vectorized_equal_to_group_indices[idx]); + } + current_row_equal_to_result |= equal_to_result; // Look forward next one row and check let next_row = self @@ -427,8 +437,10 @@ impl GroupValuesColumn { let (_, bucket_ctx) = self.occupied_buckets[idx].as_mut(); bucket_ctx.unset_checking(); } + groups[row] = current_match_group_index.unwrap(); } + current_match_group_index = None; current_row_equal_to_result = false; } } @@ -545,7 +557,7 @@ impl GroupValues for GroupValuesColumn { // 1. Collect vectorized context by checking hash values of `cols` in `map` // 2. Perform `vectorized_equal_to` // 3. Perform `vectorized_append` - // 4. Update `current_indices` + // 4. Update `current_indices` let num_rows = cols[0].len(); self.current_indices.clear(); self.current_indices.extend(0..num_rows); @@ -557,7 +569,7 @@ impl GroupValues for GroupValuesColumn { self.vectorized_equal_to(cols); // 3. Perform `vectorized_append` - self.vectorized_append(cols); + self.vectorized_append(cols, groups); // 4. 
Update `current_indices` mem::swap(&mut self.current_indices, &mut self.remaining_indices); @@ -603,18 +615,39 @@ impl GroupValues for GroupValuesColumn { .map(|v| v.take_n(n)) .collect::>(); + // Update `map` // SAFETY: self.map outlives iterator and is not modified concurrently - // unsafe { - // for bucket in self.map.iter() { - // // Decrement group index by n - // match bucket.as_ref().1.0.checked_sub(n) { - // // Group index was >= n, shift value down - // Some(sub) => bucket.as_mut().1 = sub, - // // Group index was < n, so remove from table - // None => self.map.erase(bucket), - // } - // } - // } + unsafe { + for bucket in self.map.iter() { + let group_index = { + let (_, bucket_ctx) = bucket.as_ref(); + debug_assert!(!bucket_ctx.is_checking()); + bucket_ctx.group_index() + }; + + // Decrement group index in map by n + match group_index.checked_sub(n as u64) { + // Group index was >= n, shift value down + Some(sub) => bucket.as_mut().1 = BucketContext(sub), + // Group index was < n, so remove from table + None => self.map.erase(bucket), + } + } + } + + // Update `group_index_lists` + // Loop and decrement the [n+1..] 
list nodes + let start_idx = n + 1; + let list_len = self.group_index_lists.len(); + for idx in start_idx..list_len { + let new_idx = idx - n; + + let next_idx = self.group_index_lists[idx]; + let new_next_idx = next_idx.checked_sub(n).unwrap_or(0); + + self.group_index_lists[new_idx] = new_next_idx; + } + self.group_index_lists.resize(self.group_values[0].len() + 1, 0); output } @@ -645,5 +678,15 @@ impl GroupValues for GroupValuesColumn { self.map_size = self.map.capacity() * std::mem::size_of::<(u64, usize)>(); self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); + self.group_index_lists.clear(); + self.index_lists_updates.clear(); + self.current_indices.clear(); + self.remaining_indices.clear(); + self.empty_buckets.clear(); + self.occupied_buckets.clear(); + self.vectorized_append_row_indices.clear(); + self.vectorized_equal_to_row_indices.clear(); + self.vectorized_equal_to_group_indices.clear(); + self.vectorized_equal_to_results.clear(); } } From 37d68e62c0b13d31db2f3240d226d9620af57e16 Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 08:53:23 +0800 Subject: [PATCH 20/60] fix `renaming clear` and `groups fill`. 
--- .../src/aggregates/group_values/column.rs | 113 +++++++++++++++++- 1 file changed, 110 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 0b9d9176f421..f96429890d46 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -276,8 +276,9 @@ impl GroupValuesColumn { self.vectorized_equal_to_group_indices.clear(); self.empty_buckets.clear(); self.occupied_buckets.clear(); - self.group_values_len = self.group_values[0].len(); + self.remaining_indices.clear(); + self.group_values_len = self.group_values[0].len(); for &row in self.current_indices.iter() { let target_hash = batch_hashes[row]; let entry = self.map.find(target_hash, |(exist_hash, _)| { @@ -456,13 +457,21 @@ impl GroupValuesColumn { return; } + let group_len_before_appending = self.group_values[0].len(); let iter = self.group_values.iter_mut().zip(cols.iter()); for (group_column, col) in iter { group_column.vectorized_append(col, &self.vectorized_append_row_indices); } - assert_eq!(self.group_values[0].len(), self.group_values_len); + let iter = self + .vectorized_append_row_indices + .iter() + .zip(group_len_before_appending..self.group_values_len); + for (&row, group_idx) in iter { + groups[row] = group_idx; + } + // Set back `index_lists_updates`. 
self.index_lists_updates = index_lists_updates; } @@ -647,7 +656,8 @@ impl GroupValues for GroupValuesColumn { self.group_index_lists[new_idx] = new_next_idx; } - self.group_index_lists.resize(self.group_values[0].len() + 1, 0); + self.group_index_lists + .resize(self.group_values[0].len() + 1, 0); output } @@ -690,3 +700,100 @@ impl GroupValues for GroupValuesColumn { self.vectorized_equal_to_results.clear(); } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::{ArrayRef, Int64Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + + use crate::aggregates::group_values::{column::GroupValuesColumn, GroupValues}; + + #[test] + fn test() { + // *************************************************************** + // The test group cols, the schema is `a(Int64) + b(String)`. + // It should cover following input rows situations: + // - a: null + b: null + // - a: not null + b: null + // - a: null + b: not null + // - a: not null + b: not null + // + // And it should cover following repeating situations: + // - Rows unique + // - Rows repeating in two `cols` + // - Rows repeating in single `cols` + // *************************************************************** + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + ])); + // // Case 1 + // Cols 1 + let a: ArrayRef = Arc::new(Int64Array::from(vec![ + None, + Some(42), + None, + Some(24), + Some(4224), + ])); + let b: ArrayRef = Arc::new(StringArray::from(vec![ + None, + None, + Some("42"), + Some("24"), + Some("4224"), + ])); + let cols1 = vec![a, b]; + + // Cols 2 + let a: ArrayRef = Arc::new(Int64Array::from(vec![ + None, + Some(42), + None, + Some(24), + Some(2442), + ])); + let b: ArrayRef = Arc::new(StringArray::from(vec![ + None, + None, + Some("42"), + Some("24"), + Some("2442"), + ])); + let cols2 = vec![a, b]; + + // Cols 3 + let a: ArrayRef = Arc::new(Int64Array::from(vec![ + None, + Some(42), + None, + 
Some(24), + None, + Some(42), + None, + Some(24), + Some(4224), + ])); + let b: ArrayRef = Arc::new(StringArray::from(vec![ + None, + None, + Some("42"), + Some("24"), + None, + None, + Some("42"), + Some("24"), + Some("4224"), + ])); + let cols3 = vec![a, b]; + + let mut group_values = GroupValuesColumn::try_new(schema).unwrap(); + let mut groups = Vec::new(); + group_values.intern(&cols1, &mut groups).unwrap(); + group_values.intern(&cols2, &mut groups).unwrap(); + group_values.intern(&cols3, &mut groups).unwrap(); + } +} From ebd9db97bb70c73ddf36b4683787888f72e8a091 Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 08:57:57 +0800 Subject: [PATCH 21/60] fix death loop due to rehashing. --- datafusion/physical-plan/src/aggregates/group_values/column.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index f96429890d46..f042fdad682f 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -562,6 +562,8 @@ impl GroupValues for GroupValuesColumn { batch_hashes.resize(n_rows, 0); create_hashes(cols, &self.random_state, &mut batch_hashes)?; + self.map.reserve(n_rows, |(hash, _)| *hash); + // General steps for one round `vectorized equal_to & append`: // 1. Collect vectorized context by checking hash values of `cols` in `map` // 2. Perform `vectorized_equal_to` From 71c45ce3b032c3bdb7818ab6b3baedfd468a01b2 Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 18:05:22 +0800 Subject: [PATCH 22/60] fix vectorized append. 
--- .../src/aggregates/group_values/column.rs | 10 ++++++++-- .../src/aggregates/group_values/group_column.rs | 8 ++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index f042fdad682f..abb1c3957659 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -411,7 +411,7 @@ impl GroupValuesColumn { // Look forward next one row and check let next_row = self - .vectorized_append_row_indices + .vectorized_equal_to_row_indices .get(idx + 1) .unwrap_or(&usize::MAX); if row != *next_row { @@ -462,7 +462,13 @@ impl GroupValuesColumn { for (group_column, col) in iter { group_column.vectorized_append(col, &self.vectorized_append_row_indices); } - assert_eq!(self.group_values[0].len(), self.group_values_len); + assert_eq!( + self.group_values[0].len(), + self.group_values_len, + "group_len_before_appending:{}, vectorized_append_row_indices:{}", + group_len_before_appending, + self.vectorized_append_row_indices.len(), + ); let iter = self .vectorized_append_row_indices diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 22cbe70e90ca..63b2d8481541 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -209,6 +209,8 @@ impl GroupColumn (true, Some(false)) => { self.nulls.append_n(rows.len(), true); + let new_len = self.group_values.len() + rows.len(); + self.group_values.resize(new_len, T::default_value()); } (false, _) => { @@ -377,6 +379,10 @@ where Some(false) => { self.nulls.append_n(rows.len(), true); + + let new_len = self.offsets.len() + rows.len(); + let offset = self.buffer.len(); + self.offsets.resize(new_len, 
O::usize_as(offset)); } } } @@ -754,6 +760,8 @@ impl ByteViewGroupValueBuilder { Some(false) => { self.nulls.append_n(rows.len(), true); + let new_len = self.views.len() + rows.len(); + self.views.resize(new_len, 0); } } } From 2f272f260cd556fc24ef28370031e53cc5da700e Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 18:34:23 +0800 Subject: [PATCH 23/60] add counter. --- datafusion/physical-plan/src/aggregates/group_values/column.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index abb1c3957659..6ee1bd134bc4 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -578,6 +578,7 @@ impl GroupValues for GroupValuesColumn { let num_rows = cols[0].len(); self.current_indices.clear(); self.current_indices.extend(0..num_rows); + let mut count = 0; while self.current_indices.len() > 0 { // 1. Collect vectorized context by checking hash values of `cols` in `map` self.collect_vectorized_process_context(&batch_hashes); @@ -590,8 +591,10 @@ impl GroupValues for GroupValuesColumn { // 4. Update `current_indices` mem::swap(&mut self.current_indices, &mut self.remaining_indices); + count += 1; } + dbg!(&count); self.hashes_buffer = batch_hashes; Ok(()) From 731723c486b7f8fca2705af6f822c321f0c556d2 Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 19:03:06 +0800 Subject: [PATCH 24/60] use extend rather than resize. 
--- .../physical-plan/src/aggregates/group_values/column.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 6ee1bd134bc4..6e4958c34ad5 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -447,7 +447,9 @@ impl GroupValuesColumn { } // 2. Resize the `group_index_lists`, apply `pending_index_lists_updates` to it - self.group_index_lists.resize(self.group_values_len + 1, 0); + let addition = (self.group_values_len + 1) - self.group_index_lists.len(); + self.group_index_lists + .extend(iter::repeat(0).take(addition)); for &(latest_index, prev_index) in index_lists_updates.iter() { self.group_index_lists[latest_index] = prev_index; } From a77f5161be0b5572d10383283a7407a10efbc7ca Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 19:03:38 +0800 Subject: [PATCH 25/60] remove dbg!. --- datafusion/physical-plan/src/aggregates/group_values/column.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 6e4958c34ad5..c3f6b6cfe405 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -596,7 +596,6 @@ impl GroupValues for GroupValuesColumn { count += 1; } - dbg!(&count); self.hashes_buffer = batch_hashes; Ok(()) From 1830c1ae56688874ca2dceb5e3fb28068b3d75d2 Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 20:14:41 +0800 Subject: [PATCH 26/60] remove reserve. 
--- .../src/aggregates/group_values/group_column.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 63b2d8481541..c40e6eb3fa97 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -26,6 +26,7 @@ use arrow::array::StringViewBuilder; use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; use arrow::buffer::OffsetBuffer; use arrow::buffer::ScalarBuffer; +use arrow::compute; use arrow::datatypes::ByteArrayType; use arrow::datatypes::ByteViewType; use arrow::datatypes::DataType; @@ -39,6 +40,7 @@ use datafusion_expr::sqlparser::keywords::NULLABLE; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow_array::types::GenericStringType; use datafusion_physical_expr_common::binary_map::{OutputType, INITIAL_BUFFER_CAPACITY}; +use std::iter; use std::marker::PhantomData; use std::mem; use std::sync::Arc; @@ -201,7 +203,6 @@ impl GroupColumn (true, Some(true)) => { self.nulls.append_n(rows.len(), false); - self.group_values.reserve(rows.len()); for &row in rows { self.group_values.push(arr.value(row)); } @@ -209,12 +210,11 @@ impl GroupColumn (true, Some(false)) => { self.nulls.append_n(rows.len(), true); - let new_len = self.group_values.len() + rows.len(); - self.group_values.resize(new_len, T::default_value()); + self.group_values + .extend(iter::repeat(T::default_value()).take(rows.len())); } (false, _) => { - self.group_values.reserve(rows.len()); for &row in rows { self.group_values.push(arr.value(row)); } From b6f2d0049402279448cfe0c6c13eb8f937fa055e Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 27 Oct 2024 23:35:30 +0800 Subject: [PATCH 27/60] refactor the codes to make simpler and more performant. 
--- .../src/aggregates/group_values/column.rs | 495 ++++++++++-------- 1 file changed, 268 insertions(+), 227 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index c3f6b6cfe405..1c5e8930903f 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -48,9 +48,8 @@ use datafusion_physical_expr::binary_map::OutputType; use datafusion_physical_expr_common::datum::compare_with_eq; use hashbrown::raw::{Bucket, RawTable}; -const CHECKING_FLAG_MASK: u64 = 0x8000000000000000; -const SET_CHECKING_FLAG_MASK: u64 = 0x8000000000000000; -const UNSET_CHECKING_FLAG_MASK: u64 = 0x7FFFFFFFFFFFFFFF; +const NON_INLINED_FLAG: u64 = 0x8000000000000000; +const VALUE_MASK: u64 = 0x7FFFFFFFFFFFFFFF; /// `BucketContext` is a packed struct /// @@ -89,27 +88,28 @@ const UNSET_CHECKING_FLAG_MASK: u64 = 0x7FFFFFFFFFFFFFFF; /// The group's index in group values /// #[derive(Debug, Clone, Copy)] -struct BucketContext(u64); +struct GroupIndexView(u64); -impl BucketContext { +impl GroupIndexView { #[inline] - pub fn is_checking(&self) -> bool { - (self.0 & CHECKING_FLAG_MASK) > 0 + pub fn is_non_inlinded(&self) -> bool { + (self.0 & NON_INLINED_FLAG) > 0 } #[inline] - pub fn set_checking(&mut self) { - self.0 |= SET_CHECKING_FLAG_MASK + pub fn new_inlined(group_index: u64) -> Self { + Self(group_index) } #[inline] - pub fn unset_checking(&mut self) { - self.0 &= UNSET_CHECKING_FLAG_MASK + pub fn new_non_inlined(list_offset: u64) -> Self { + let non_inlined_value = list_offset | NON_INLINED_FLAG; + Self(non_inlined_value) } #[inline] - pub fn group_index(&self) -> u64 { - self.0 & UNSET_CHECKING_FLAG_MASK + pub fn value(&self) -> u64 { + self.0 & VALUE_MASK } } @@ -127,7 +127,7 @@ pub struct GroupValuesColumn { /// /// keys: u64 hashes of the GroupValue /// values: (hash, group_index) - map: RawTable<(u64, 
BucketContext)>, + map: RawTable<(u64, GroupIndexView)>, /// The size of `map` in bytes map_size: usize, @@ -139,27 +139,13 @@ pub struct GroupValuesColumn { /// /// The chained indices is like: /// `latest group index -> older group index -> even older group index -> ...` - group_index_lists: Vec, + group_index_lists: Vec>, index_lists_updates: Vec<(usize, usize)>, - /// We need multiple rounds to process the `input cols`, - /// and the rows processing in current round is stored here. - current_indices: Vec, - /// Similar as `current_indices`, but `remaining_indices` /// is used to store the rows will be processed in next round. - remaining_indices: Vec, - - /// The marked checking buckets in this round - /// - /// About the checking flag you can see [`BucketContext`] - empty_buckets: Vec>, - - /// The marked checking buckets in this round - /// - /// About the checking flag you can see [`BucketContext`] - occupied_buckets: Vec>, + scalarized_indices: Vec, /// The `vectorized_equal_tod` row indices buffer vectorized_equal_to_row_indices: Vec, @@ -183,8 +169,6 @@ pub struct GroupValuesColumn { /// [`GroupValuesRows`]: crate::aggregates::group_values::row::GroupValuesRows group_values: Vec>, - group_values_len: usize, - /// reused buffer to store hashes hashes_buffer: Vec, @@ -196,7 +180,6 @@ impl GroupValuesColumn { /// Create a new instance of GroupValuesColumn if supported for the specified schema pub fn try_new(schema: SchemaRef) -> Result { let map = RawTable::with_capacity(0); - let num_cols = schema.fields.len(); Ok(Self { schema, map, @@ -206,11 +189,7 @@ impl GroupValuesColumn { group_values: vec![], hashes_buffer: Default::default(), random_state: Default::default(), - current_indices: Default::default(), - remaining_indices: Default::default(), - group_values_len: 0, - empty_buckets: Default::default(), - occupied_buckets: Default::default(), + scalarized_indices: Default::default(), vectorized_equal_to_row_indices: Default::default(), 
vectorized_equal_to_group_indices: Default::default(), vectorized_equal_to_results: Default::default(), @@ -258,30 +237,29 @@ impl GroupValuesColumn { /// Collect vectorized context by checking hash values of `cols` in `map` /// /// 1. If bucket not found - /// - Insert the `new bucket` build from the `group index` + /// - Build and insert the `new inlined group index view` /// and its hash value to `map` - /// - Mark this `new bucket` checking, and add it to `checking_buckets` /// - Add row index to `vectorized_append_row_indices` + /// - Set group index to row in `groups` /// /// 2. bucket found - /// - Check if the `bucket` checking, if so add it to `remaining_indices`, - /// and just process it in next round, otherwise we continue the process - /// - Mark `bucket` checking, and add it to `checking_buckets` /// - Add row index to `vectorized_equal_to_row_indices` - /// - Add group indices(from `group_index_lists`) to `vectorized_equal_to_group_indices` + /// - Check if the `group index view` is `inlined` or `non_inlined`: + /// If it is inlined, add to `vectorized_equal_to_group_indices` directly. + /// Otherwise get all group indices from `group_index_lists`, and add them. 
/// - fn collect_vectorized_process_context(&mut self, batch_hashes: &[u64]) { + fn collect_vectorized_process_context( + &mut self, + batch_hashes: &[u64], + groups: &mut Vec, + ) { self.vectorized_append_row_indices.clear(); self.vectorized_equal_to_row_indices.clear(); self.vectorized_equal_to_group_indices.clear(); - self.empty_buckets.clear(); - self.occupied_buckets.clear(); - self.remaining_indices.clear(); - self.group_values_len = self.group_values[0].len(); - for &row in self.current_indices.iter() { - let target_hash = batch_hashes[row]; - let entry = self.map.find(target_hash, |(exist_hash, _)| { + let mut group_values_len = self.group_values[0].len(); + for (row, &target_hash) in batch_hashes.iter().enumerate() { + let entry = self.map.get(target_hash, |(exist_hash, _)| { // Somewhat surprisingly, this closure can be called even if the // hash doesn't match, so check the hash first with an integer // comparison first avoid the more expensive comparison with @@ -289,68 +267,77 @@ impl GroupValuesColumn { target_hash == *exist_hash }); - let Some(bucket) = entry else { + let Some((_, group_index_view)) = entry else { // 1. 
Bucket not found case - // Insert the `new bucket` build from the `group index` - // Mark this `new bucket` checking - let current_group_idx = self.group_values_len as u64; + // Build `new inlined group index view` + let current_group_idx = group_values_len; + let group_index_view = + GroupIndexView::new_inlined(current_group_idx as u64); + // Insert the `group index view` and its hash into `map` // for hasher function, use precomputed hash value - let mut bucket_ctx = BucketContext(current_group_idx); - bucket_ctx.set_checking(); - let bucket = self.map.insert_accounted( - (target_hash, bucket_ctx), + self.map.insert_accounted( + (target_hash, group_index_view), |(hash, _)| *hash, &mut self.map_size, ); - self.empty_buckets.push(bucket); // Add row index to `vectorized_append_row_indices` self.vectorized_append_row_indices.push(row); - self.group_values_len += 1; + // Set group index to row in `groups` + groups[row] = current_group_idx; + + group_values_len += 1; continue; }; // 2. bucket found - // Check if the `bucket` checking, if so add it to `remaining_indices`, - // and just process it in next round, otherwise we continue the process - let mut list_next_group_idx = unsafe { - let (_, bucket_ctx) = bucket.as_mut(); - - if bucket_ctx.is_checking() { - self.remaining_indices.push(row); - continue; + // Check if the `group index view` is `inlined` or `non_inlined` + if group_index_view.is_non_inlinded() { + // Non-inlined case, the value of view is offset in `group_index_lists`. + // We use it to get `group_index_list`, and add related `rows` and `group_indices` + // into `vectorized_equal_to_row_indices` and `vectorized_equal_to_group_indices`. 
+ let list_offset = group_index_view.value() as usize; + let group_index_list = &self.group_index_lists[list_offset]; + for &group_index in group_index_list { + self.vectorized_equal_to_row_indices.push(row); + self.vectorized_equal_to_group_indices.push(group_index); } - - // Mark `bucket` checking - bucket_ctx.set_checking(); - bucket_ctx.group_index() as usize + 1 - }; - - // Add it to `checking_buckets` - // Add row index to `vectorized_equal_to_row_indices` - // Add group indices(from `group_index_lists`) to `vectorized_equal_to_group_indices` - while list_next_group_idx > 0 { - self.occupied_buckets.push(bucket.clone()); - - let list_group_idx = list_next_group_idx; + } else { + let group_index = group_index_view.value() as usize; self.vectorized_equal_to_row_indices.push(row); - self.vectorized_equal_to_group_indices - .push(list_group_idx - 1); - list_next_group_idx = self.group_index_lists[list_group_idx]; + self.vectorized_equal_to_group_indices.push(group_index); } } + } - // Reset empty bucket's checking flag - self.empty_buckets.iter().for_each(|bucket| unsafe { - let (_, bucket_ctx) = bucket.as_mut(); - bucket_ctx.unset_checking(); - }); + /// Perform `vectorized_append`` for `rows` in `vectorized_append_row_indices` + fn vectorized_append(&mut self, cols: &[ArrayRef]) { + if self.vectorized_append_row_indices.is_empty() { + return; + } + + let iter = self.group_values.iter_mut().zip(cols.iter()); + for (group_column, col) in iter { + group_column.vectorized_append(col, &self.vectorized_append_row_indices); + } } /// Perform `vectorized_equal_to` - fn vectorized_equal_to(&mut self, cols: &[ArrayRef]) { + /// + /// 1. Perform `vectorized_equal_to` for `rows` in `vectorized_equal_to_group_indices` + /// and `group_indices` in `vectorized_equal_to_group_indices`. + /// + /// 2. Check `equal_to_results`: + /// + /// If found equal to `rows`, set the `group_indices` to `rows` in `groups`. 
+ /// + /// If found not equal to `row`s, just add them to `scalarized_indices`, + /// and perform `scalarized_intern` for them after. + /// Usually, such `rows` having same hash but different value with `exists rows` + /// are very few. + fn vectorized_equal_to(&mut self, cols: &[ArrayRef], groups: &mut Vec) { assert_eq!( self.vectorized_equal_to_group_indices.len(), self.vectorized_equal_to_row_indices.len() @@ -360,10 +347,12 @@ impl GroupValuesColumn { return; } - // Vectorized equal to `cols` and `group columns` + // 1. Perform `vectorized_equal_to` for `rows` in `vectorized_equal_to_group_indices` + // and `group_indices` in `vectorized_equal_to_group_indices` let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results); equal_to_results.clear(); equal_to_results.resize(self.vectorized_equal_to_group_indices.len(), true); + self.scalarized_indices.clear(); for (col_idx, group_col) in self.group_values.iter().enumerate() { group_col.vectorized_equal_to( @@ -375,113 +364,157 @@ impl GroupValuesColumn { } self.vectorized_equal_to_results = equal_to_results; - } - /// Perform `vectorized_append` - /// - /// 1. Check equal to results, if found a equal row, nothing to do; - /// otherwise, we should create a new group for the row: - /// - /// - Modify the related bucket stored in `checking_buckets`, - /// - Store updates for `group_index_lists` in `pending_index_lists_updates` - /// - Increase the `group_values_len` - /// - /// 2. Resize the `group_index_lists`, apply `pending_index_lists_updates` to it - /// - /// 3. Perform `vectorized_append` - /// - fn vectorized_append(&mut self, cols: &[ArrayRef], groups: &mut Vec) { - let mut index_lists_updates = mem::take(&mut self.index_lists_updates); - index_lists_updates.clear(); - // Set the default value to usize::MAX, so when we made a mistake, - // panic will happen rather than - groups.resize(cols[0].len(), usize::MAX); - - // 1. 
Check equal to results, if found a equal row, nothing to do; - // otherwise, we should create a new group for the row. + // 2. Check `equal_to_results`, if found not equal to `row`s, just add them + // to `scalarized_indices`, and perform `scalarized_intern` for them after. let mut current_row_equal_to_result = false; - let mut current_match_group_index = None; for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() { let equal_to_result = self.vectorized_equal_to_results[idx]; + + // Equal to case, set the `group_indices` to `rows` in `groups` if equal_to_result { - current_match_group_index = - Some(self.vectorized_equal_to_group_indices[idx]); + groups[row] = self.vectorized_equal_to_group_indices[idx]; } current_row_equal_to_result |= equal_to_result; - // Look forward next one row and check + // Look forward next one row to check if have checked all results + // of current row let next_row = self .vectorized_equal_to_row_indices .get(idx + 1) .unwrap_or(&usize::MAX); + + // Have checked all results of current row, check the total result if row != *next_row { - // If we should create a new group for the row - // Store update for `group_index_lists` - // Update related `BucketContext`(set the group index to latest) - // Increase the `group_values_len` + // Not equal to case, add `row` to `scalarized_indices` if !current_row_equal_to_result { - self.vectorized_append_row_indices.push(row); - unsafe { - let (_, bucket_ctx) = self.occupied_buckets[idx].as_mut(); - - index_lists_updates.push(( - self.group_values_len + 1, - (bucket_ctx.group_index() + 1) as usize, - )); - - *bucket_ctx = BucketContext(self.group_values_len as u64); - } - - self.group_values_len += 1; - } else { - unsafe { - let (_, bucket_ctx) = self.occupied_buckets[idx].as_mut(); - bucket_ctx.unset_checking(); - } - groups[row] = current_match_group_index.unwrap(); + self.scalarized_indices.push(row); } - current_match_group_index = None; + // Init the total result for checking 
next row current_row_equal_to_result = false; } } + } - // 2. Resize the `group_index_lists`, apply `pending_index_lists_updates` to it - let addition = (self.group_values_len + 1) - self.group_index_lists.len(); - self.group_index_lists - .extend(iter::repeat(0).take(addition)); - for &(latest_index, prev_index) in index_lists_updates.iter() { - self.group_index_lists[latest_index] = prev_index; - } + fn scalarized_intern( + &mut self, + cols: &[ArrayRef], + batch_hashes: &[u64], + groups: &mut Vec, + ) { + for &row in &self.scalarized_indices { + let target_hash = batch_hashes[row]; + let entry = + self.map + .get_mut(target_hash, |(exist_hash, group_index_view)| { + // Somewhat surprisingly, this closure can be called even if the + // hash doesn't match, so check the hash first with an integer + // comparison first avoid the more expensive comparison with + // group value. https://github.com/apache/datafusion/pull/11718 + if target_hash != *exist_hash { + return false; + } - // 3. Perform `vectorized_append` - if self.vectorized_append_row_indices.is_empty() { - return; - } + fn check_row_equal( + array_row: &dyn GroupColumn, + lhs_row: usize, + array: &ArrayRef, + rhs_row: usize, + ) -> bool { + array_row.equal_to(lhs_row, array, rhs_row) + } - let group_len_before_appending = self.group_values[0].len(); - let iter = self.group_values.iter_mut().zip(cols.iter()); - for (group_column, col) in iter { - group_column.vectorized_append(col, &self.vectorized_append_row_indices); - } - assert_eq!( - self.group_values[0].len(), - self.group_values_len, - "group_len_before_appending:{}, vectorized_append_row_indices:{}", - group_len_before_appending, - self.vectorized_append_row_indices.len(), - ); + if group_index_view.is_non_inlinded() { + let mut check_result = false; + let list_offset = group_index_view.value() as usize; + let group_index_list = &self.group_index_lists[list_offset]; + + for &group_idx in group_index_list { + // If found one matched group, return 
true result + if check_result { + return true; + } + + for (i, group_val) in self.group_values.iter().enumerate() + { + if !check_row_equal( + group_val.as_ref(), + group_idx, + &cols[i], + row, + ) { + break; + } + check_result = true; + } + } + + // All groups unmatched, return false result + false + } else { + let group_idx = group_index_view.value() as usize; + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal( + group_val.as_ref(), + group_idx, + &cols[i], + row, + ) { + return false; + } + } + true + } + }); + + // Only `rows` having the same hash value with `exist rows` but different value + // will be process in `scalarized_intern`. + // So related `buckets` in `map` is ensured to be `Some`. + let Some((_, group_index_view)) = entry else { + unreachable!() + }; + + // Insert the `row` to `group_values` before checking `next row` + let group_idx = self.group_values[0].len(); + + let mut checklen = 0; + for (i, group_value) in self.group_values.iter_mut().enumerate() { + group_value.append_val(&cols[i], row); + let len = group_value.len(); + if i == 0 { + checklen = len; + } else { + debug_assert_eq!(checklen, len); + } + } + + // Check if the `view` is `inlined` or `non-inlined` + if group_index_view.is_non_inlinded() { + // Non-inlined case, get `group_index_list` from `group_index_lists`, + // then add the new `group` with the same hash values into it. + let list_offset = group_index_view.value() as usize; + let group_index_list = &mut self.group_index_lists[list_offset]; + group_index_list.push(group_idx); + } else { + // Inlined case + let list_offset = self.group_index_lists.len(); + + // Create new `group_index_list` including + // `exist group index` + `new group index`. + // Add new `group_index_list` into ``group_index_lists`. 
+ let exist_group_index = group_index_view.value() as usize; + let new_group_index_list = vec![exist_group_index, group_idx]; + self.group_index_lists.push(new_group_index_list); + + // Update the `group_index_view` to non-inlined + let new_group_index_view = + GroupIndexView::new_non_inlined(list_offset as u64); + *group_index_view = new_group_index_view; + } - let iter = self - .vectorized_append_row_indices - .iter() - .zip(group_len_before_appending..self.group_values_len); - for (&row, group_idx) in iter { groups[row] = group_idx; } - - // Set back `index_lists_updates`. - self.index_lists_updates = index_lists_updates; } } @@ -570,31 +603,24 @@ impl GroupValues for GroupValuesColumn { batch_hashes.resize(n_rows, 0); create_hashes(cols, &self.random_state, &mut batch_hashes)?; - self.map.reserve(n_rows, |(hash, _)| *hash); - // General steps for one round `vectorized equal_to & append`: // 1. Collect vectorized context by checking hash values of `cols` in `map` // 2. Perform `vectorized_equal_to` // 3. Perform `vectorized_append` // 4. Update `current_indices` - let num_rows = cols[0].len(); - self.current_indices.clear(); - self.current_indices.extend(0..num_rows); - let mut count = 0; - while self.current_indices.len() > 0 { - // 1. Collect vectorized context by checking hash values of `cols` in `map` - self.collect_vectorized_process_context(&batch_hashes); - - // 2. Perform `vectorized_equal_to` - self.vectorized_equal_to(cols); - - // 3. Perform `vectorized_append` - self.vectorized_append(cols, groups); - - // 4. Update `current_indices` - mem::swap(&mut self.current_indices, &mut self.remaining_indices); - count += 1; - } + groups.resize(n_rows, usize::MAX); + + // 1. Collect vectorized context by checking hash values of `cols` in `map` + self.collect_vectorized_process_context(&batch_hashes, groups); + + // 2. Perform `vectorized_append` + self.vectorized_append(cols); + + // 3. 
Perform `vectorized_equal_to` + self.vectorized_equal_to(cols, groups); + + // 4. Update `current_indices` + self.scalarized_intern(cols, &batch_hashes, groups); self.hashes_buffer = batch_hashes; @@ -635,42 +661,60 @@ impl GroupValues for GroupValuesColumn { .iter_mut() .map(|v| v.take_n(n)) .collect::>(); + let new_group_index_lists = + Vec::with_capacity(self.group_index_lists.len()); + let old_group_index_lists = + std::mem::replace(&mut self.group_index_lists, new_group_index_lists); - // Update `map` // SAFETY: self.map outlives iterator and is not modified concurrently unsafe { for bucket in self.map.iter() { - let group_index = { - let (_, bucket_ctx) = bucket.as_ref(); - debug_assert!(!bucket_ctx.is_checking()); - bucket_ctx.group_index() - }; - - // Decrement group index in map by n - match group_index.checked_sub(n as u64) { - // Group index was >= n, shift value down - Some(sub) => bucket.as_mut().1 = BucketContext(sub), - // Group index was < n, so remove from table - None => self.map.erase(bucket), + // Check if it is `inlined` or `non-inlined` + if bucket.as_ref().1.is_non_inlinded() { + // Non-inlined case + // We take `group_index_list` from `old_group_index_lists` + let list_offset = bucket.as_ref().1.value() as usize; + let old_group_index_list = + &old_group_index_lists[list_offset]; + + let mut new_group_index_list = Vec::new(); + for &group_index in old_group_index_list { + if let Some(remaining) = group_index.checked_sub(n) { + new_group_index_list.push(remaining); + } + } + + // The possible results: + // - `new_group_index_list` is empty, we should erase this bucket + // - only one value in `new_group_index_list`, switch the `view` to `inlined` + // - still multiple values in `new_group_index_list`, build and set the new `unlined view` + if new_group_index_list.is_empty() { + self.map.erase(bucket); + } else if new_group_index_list.len() == 1 { + let group_index = new_group_index_list.first().unwrap(); + bucket.as_mut().1 = + 
GroupIndexView::new_inlined(*group_index as u64); + } else { + let new_list_offset = self.group_index_lists.len(); + self.group_index_lists.push(new_group_index_list); + bucket.as_mut().1 = + GroupIndexView::new_inlined(new_list_offset as u64); + } + } else { + // Inlined case, we just decrement group index by n + let group_index = bucket.as_ref().1.value() as usize; + match group_index.checked_sub(n) { + // Group index was >= n, shift value down + Some(sub) => { + bucket.as_mut().1 = GroupIndexView::new_inlined(sub as u64) + } + // Group index was < n, so remove from table + None => self.map.erase(bucket), + } } } } - // Update `group_index_lists` - // Loop and decrement the [n+1..] list nodes - let start_idx = n + 1; - let list_len = self.group_index_lists.len(); - for idx in start_idx..list_len { - let new_idx = idx - n; - - let next_idx = self.group_index_lists[idx]; - let new_next_idx = next_idx.checked_sub(n).unwrap_or(0); - - self.group_index_lists[new_idx] = new_next_idx; - } - self.group_index_lists - .resize(self.group_values[0].len() + 1, 0); - output } }; @@ -702,10 +746,7 @@ impl GroupValues for GroupValuesColumn { self.hashes_buffer.shrink_to(count); self.group_index_lists.clear(); self.index_lists_updates.clear(); - self.current_indices.clear(); - self.remaining_indices.clear(); - self.empty_buckets.clear(); - self.occupied_buckets.clear(); + self.scalarized_indices.clear(); self.vectorized_append_row_indices.clear(); self.vectorized_equal_to_row_indices.clear(); self.vectorized_equal_to_group_indices.clear(); From 6375d9324d332184074222f048ffebaab21b0343 Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 28 Oct 2024 00:56:13 +0800 Subject: [PATCH 28/60] clear `scalarized_indices` in `intern` to avoid some corner case. 
--- .../physical-plan/src/aggregates/group_values/column.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 1c5e8930903f..7287dc8afa96 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -352,7 +352,6 @@ impl GroupValuesColumn { let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results); equal_to_results.clear(); equal_to_results.resize(self.vectorized_equal_to_group_indices.len(), true); - self.scalarized_indices.clear(); for (col_idx, group_col) in self.group_values.iter().enumerate() { group_col.vectorized_equal_to( @@ -403,6 +402,10 @@ impl GroupValuesColumn { batch_hashes: &[u64], groups: &mut Vec, ) { + if self.scalarized_indices.is_empty() { + return; + } + for &row in &self.scalarized_indices { let target_hash = batch_hashes[row]; let entry = @@ -609,6 +612,7 @@ impl GroupValues for GroupValuesColumn { // 3. Perform `vectorized_append` // 4. Update `current_indices` groups.resize(n_rows, usize::MAX); + self.scalarized_indices.clear(); // 1. Collect vectorized context by checking hash values of `cols` in `map` self.collect_vectorized_process_context(&batch_hashes, groups); From 7979f74e21c22a774a80a647d5c360a8362deff6 Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 28 Oct 2024 03:38:10 +0800 Subject: [PATCH 29/60] fix `scalarized_equal_to`. 
--- .../src/aggregates/group_values/column.rs | 168 ++++++++++-------- 1 file changed, 95 insertions(+), 73 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 7287dc8afa96..a7e252fcfc0f 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -92,7 +92,7 @@ struct GroupIndexView(u64); impl GroupIndexView { #[inline] - pub fn is_non_inlinded(&self) -> bool { + pub fn is_non_inlined(&self) -> bool { (self.0 & NON_INLINED_FLAG) > 0 } @@ -294,7 +294,7 @@ impl GroupValuesColumn { // 2. bucket found // Check if the `group index view` is `inlined` or `non_inlined` - if group_index_view.is_non_inlinded() { + if group_index_view.is_non_inlined() { // Non-inlined case, the value of view is offset in `group_index_lists`. // We use it to get `group_index_list`, and add related `rows` and `group_indices` // into `vectorized_equal_to_row_indices` and `vectorized_equal_to_group_indices`. @@ -362,13 +362,11 @@ impl GroupValuesColumn { ); } - self.vectorized_equal_to_results = equal_to_results; - // 2. Check `equal_to_results`, if found not equal to `row`s, just add them // to `scalarized_indices`, and perform `scalarized_intern` for them after. 
let mut current_row_equal_to_result = false; for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() { - let equal_to_result = self.vectorized_equal_to_results[idx]; + let equal_to_result = equal_to_results[idx]; // Equal to case, set the `group_indices` to `rows` in `groups` if equal_to_result { @@ -394,6 +392,59 @@ impl GroupValuesColumn { current_row_equal_to_result = false; } } + + self.vectorized_equal_to_results = equal_to_results; + } + + fn scalarized_equal_to( + &self, + group_index_view: &GroupIndexView, + cols: &[ArrayRef], + row: usize, + groups: &mut Vec, + ) -> bool { + // Check if this row exists in `group_values` + fn check_row_equal( + array_row: &dyn GroupColumn, + lhs_row: usize, + array: &ArrayRef, + rhs_row: usize, + ) -> bool { + array_row.equal_to(lhs_row, array, rhs_row) + } + + if group_index_view.is_non_inlined() { + let list_offset = group_index_view.value() as usize; + let group_index_list = &self.group_index_lists[list_offset]; + + for &group_idx in group_index_list { + let mut check_result = true; + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal(group_val.as_ref(), group_idx, &cols[i], row) { + check_result = false; + break; + } + } + + if check_result { + groups[row] = group_idx; + return true; + } + } + + // All groups unmatched, return false result + false + } else { + let group_idx = group_index_view.value() as usize; + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal(group_val.as_ref(), group_idx, &cols[i], row) { + return false; + } + } + + groups[row] = group_idx; + true + } } fn scalarized_intern( @@ -406,70 +457,17 @@ impl GroupValuesColumn { return; } + let mut map = mem::take(&mut self.map); + for &row in &self.scalarized_indices { let target_hash = batch_hashes[row]; - let entry = - self.map - .get_mut(target_hash, |(exist_hash, group_index_view)| { - // Somewhat surprisingly, this closure can be called even if the - // hash doesn't 
match, so check the hash first with an integer - // comparison first avoid the more expensive comparison with - // group value. https://github.com/apache/datafusion/pull/11718 - if target_hash != *exist_hash { - return false; - } - - fn check_row_equal( - array_row: &dyn GroupColumn, - lhs_row: usize, - array: &ArrayRef, - rhs_row: usize, - ) -> bool { - array_row.equal_to(lhs_row, array, rhs_row) - } - - if group_index_view.is_non_inlinded() { - let mut check_result = false; - let list_offset = group_index_view.value() as usize; - let group_index_list = &self.group_index_lists[list_offset]; - - for &group_idx in group_index_list { - // If found one matched group, return true result - if check_result { - return true; - } - - for (i, group_val) in self.group_values.iter().enumerate() - { - if !check_row_equal( - group_val.as_ref(), - group_idx, - &cols[i], - row, - ) { - break; - } - check_result = true; - } - } - - // All groups unmatched, return false result - false - } else { - let group_idx = group_index_view.value() as usize; - for (i, group_val) in self.group_values.iter().enumerate() { - if !check_row_equal( - group_val.as_ref(), - group_idx, - &cols[i], - row, - ) { - return false; - } - } - true - } - }); + let entry = map.get_mut(target_hash, |(exist_hash, group_index_view)| { + // Somewhat surprisingly, this closure can be called even if the + // hash doesn't match, so check the hash first with an integer + // comparison first avoid the more expensive comparison with + // group value. https://github.com/apache/datafusion/pull/11718 + target_hash == *exist_hash + }); // Only `rows` having the same hash value with `exist rows` but different value // will be process in `scalarized_intern`. @@ -478,9 +476,15 @@ impl GroupValuesColumn { unreachable!() }; + // Perform scalarized equal to + if self.scalarized_equal_to(&group_index_view, cols, row, groups) { + // Found the row actually exists in group values, + // don't need to create new group for it. 
+ continue; + } + // Insert the `row` to `group_values` before checking `next row` let group_idx = self.group_values[0].len(); - let mut checklen = 0; for (i, group_value) in self.group_values.iter_mut().enumerate() { group_value.append_val(&cols[i], row); @@ -493,7 +497,7 @@ impl GroupValuesColumn { } // Check if the `view` is `inlined` or `non-inlined` - if group_index_view.is_non_inlinded() { + if group_index_view.is_non_inlined() { // Non-inlined case, get `group_index_list` from `group_index_lists`, // then add the new `group` with the same hash values into it. let list_offset = group_index_view.value() as usize; @@ -518,6 +522,8 @@ impl GroupValuesColumn { groups[row] = group_idx; } + + self.map = map; } } @@ -674,7 +680,7 @@ impl GroupValues for GroupValuesColumn { unsafe { for bucket in self.map.iter() { // Check if it is `inlined` or `non-inlined` - if bucket.as_ref().1.is_non_inlinded() { + if bucket.as_ref().1.is_non_inlined() { // Non-inlined case // We take `group_index_list` from `old_group_index_lists` let list_offset = bucket.as_ref().1.value() as usize; @@ -701,8 +707,9 @@ impl GroupValues for GroupValuesColumn { } else { let new_list_offset = self.group_index_lists.len(); self.group_index_lists.push(new_group_index_list); - bucket.as_mut().1 = - GroupIndexView::new_inlined(new_list_offset as u64); + bucket.as_mut().1 = GroupIndexView::new_non_inlined( + new_list_offset as u64, + ); } } else { // Inlined case, we just decrement group index by n @@ -710,7 +717,8 @@ impl GroupValues for GroupValuesColumn { match group_index.checked_sub(n) { // Group index was >= n, shift value down Some(sub) => { - bucket.as_mut().1 = GroupIndexView::new_inlined(sub as u64) + bucket.as_mut().1 = + GroupIndexView::new_inlined(sub as u64) } // Group index was < n, so remove from table None => self.map.erase(bucket), @@ -762,8 +770,10 @@ impl GroupValues for GroupValuesColumn { mod tests { use std::sync::Arc; + use ahash::RandomState; use arrow_array::{ArrayRef, 
Int64Array, StringArray}; use arrow_schema::{DataType, Field, Schema}; + use datafusion_common::hash_utils::create_hashes; use crate::aggregates::group_values::{column::GroupValuesColumn, GroupValues}; @@ -853,4 +863,16 @@ mod tests { group_values.intern(&cols2, &mut groups).unwrap(); group_values.intern(&cols3, &mut groups).unwrap(); } + + #[test] + fn test2() { + let col1 = Arc::new(Int64Array::from(vec![Some(1), Some(0)])) as _; + let col2 = Arc::new(Int64Array::from(vec![Some(0), Some(1)])) as _; + let col3 = Arc::new(Int64Array::from(vec![Some(0), Some(0)])) as _; + let cols = vec![col1, col2, col3]; + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let mut hash_buffer = vec![0; 2]; + create_hashes(&cols, &random_state, &mut hash_buffer); + dbg!(&hash_buffer); + } } From 86dcb113a627ae4a3c39d9c037d79dbbf305f18d Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 28 Oct 2024 14:05:15 +0800 Subject: [PATCH 30/60] fallback to total scalarized `GroupValuesColumn` in streaming aggregation. --- .../src/aggregates/group_values/column.rs | 436 ++++++++++++------ .../src/aggregates/group_values/mod.rs | 17 +- .../physical-plan/src/aggregates/row_hash.rs | 2 +- 3 files changed, 304 insertions(+), 151 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index a7e252fcfc0f..bf984cdef03d 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -115,7 +115,7 @@ impl GroupIndexView { /// A [`GroupValues`] that stores multiple columns of group values. 
/// -pub struct GroupValuesColumn { +pub struct VectorizedGroupValuesColumn { /// The output schema schema: SchemaRef, @@ -176,7 +176,7 @@ pub struct GroupValuesColumn { random_state: RandomState, } -impl GroupValuesColumn { +impl VectorizedGroupValuesColumn { /// Create a new instance of GroupValuesColumn if supported for the specified schema pub fn try_new(schema: SchemaRef) -> Result { let map = RawTable::with_capacity(0); @@ -197,43 +197,6 @@ impl GroupValuesColumn { }) } - /// Returns true if [`GroupValuesColumn`] supported for the specified schema - pub fn supported_schema(schema: &Schema) -> bool { - schema - .fields() - .iter() - .map(|f| f.data_type()) - .all(Self::supported_type) - } - - /// Returns true if the specified data type is supported by [`GroupValuesColumn`] - /// - /// In order to be supported, there must be a specialized implementation of - /// [`GroupColumn`] for the data type, instantiated in [`Self::intern`] - fn supported_type(data_type: &DataType) -> bool { - matches!( - *data_type, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float32 - | DataType::Float64 - | DataType::Utf8 - | DataType::LargeUtf8 - | DataType::Binary - | DataType::LargeBinary - | DataType::Date32 - | DataType::Date64 - | DataType::Utf8View - | DataType::BinaryView - ) - } - /// Collect vectorized context by checking hash values of `cols` in `map` /// /// 1. If bucket not found @@ -546,7 +509,7 @@ macro_rules! 
instantiate_primitive { }; } -impl GroupValues for GroupValuesColumn { +impl GroupValues for VectorizedGroupValuesColumn { fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { let n_rows = cols[0].len(); @@ -766,113 +729,294 @@ impl GroupValues for GroupValuesColumn { } } -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use ahash::RandomState; - use arrow_array::{ArrayRef, Int64Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion_common::hash_utils::create_hashes; - - use crate::aggregates::group_values::{column::GroupValuesColumn, GroupValues}; - - #[test] - fn test() { - // *************************************************************** - // The test group cols, the schema is `a(Int64) + b(String)`. - // It should cover following input rows situations: - // - a: null + b: null - // - a: not null + b: null - // - a: null + b: not null - // - a: not null + b: not null - // - // And it should cover following repeating situations: - // - Rows unique - // - Rows repeating in two `cols` - // - Rows repeating in single `cols` - // *************************************************************** - - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - ])); - // // Case 1 - // Cols 1 - let a: ArrayRef = Arc::new(Int64Array::from(vec![ - None, - Some(42), - None, - Some(24), - Some(4224), - ])); - let b: ArrayRef = Arc::new(StringArray::from(vec![ - None, - None, - Some("42"), - Some("24"), - Some("4224"), - ])); - let cols1 = vec![a, b]; - - // Cols 2 - let a: ArrayRef = Arc::new(Int64Array::from(vec![ - None, - Some(42), - None, - Some(24), - Some(2442), - ])); - let b: ArrayRef = Arc::new(StringArray::from(vec![ - None, - None, - Some("42"), - Some("24"), - Some("2442"), - ])); - let cols2 = vec![a, b]; - - // Cols 3 - let a: ArrayRef = Arc::new(Int64Array::from(vec![ - None, - Some(42), - None, - Some(24), - None, - Some(42), - None, - 
Some(24), - Some(4224), - ])); - let b: ArrayRef = Arc::new(StringArray::from(vec![ - None, - None, - Some("42"), - Some("24"), - None, - None, - Some("42"), - Some("24"), - Some("4224"), - ])); - let cols3 = vec![a, b]; - - let mut group_values = GroupValuesColumn::try_new(schema).unwrap(); - let mut groups = Vec::new(); - group_values.intern(&cols1, &mut groups).unwrap(); - group_values.intern(&cols2, &mut groups).unwrap(); - group_values.intern(&cols3, &mut groups).unwrap(); +/// A [`GroupValues`] that stores multiple columns of group values. +/// +/// +pub struct GroupValuesColumn { + /// The output schema + schema: SchemaRef, + + /// Logically maps group values to a group_index in + /// [`Self::group_values`] and in each accumulator + /// + /// Uses the raw API of hashbrown to avoid actually storing the + /// keys (group values) in the table + /// + /// keys: u64 hashes of the GroupValue + /// values: (hash, group_index) + map: RawTable<(u64, usize)>, + + /// The size of `map` in bytes + map_size: usize, + + /// The actual group by values, stored column-wise. Compare from + /// the left to right, each column is stored as [`GroupColumn`]. + /// + /// Performance tests showed that this design is faster than using the + /// more general purpose [`GroupValuesRows`]. 
See the ticket for details: + /// + /// + /// [`GroupValuesRows`]: crate::aggregates::group_values::row::GroupValuesRows + group_values: Vec>, + + /// reused buffer to store hashes + hashes_buffer: Vec, + + /// Random state for creating hashes + random_state: RandomState, +} + +impl GroupValuesColumn { + /// Create a new instance of GroupValuesColumn if supported for the specified schema + pub fn try_new(schema: SchemaRef) -> Result { + let map = RawTable::with_capacity(0); + Ok(Self { + schema, + map, + map_size: 0, + group_values: vec![], + hashes_buffer: Default::default(), + random_state: Default::default(), + }) + } +} + +impl GroupValues for GroupValuesColumn { + fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { + let n_rows = cols[0].len(); + + if self.group_values.is_empty() { + let mut v = Vec::with_capacity(cols.len()); + + for f in self.schema.fields().iter() { + let nullable = f.is_nullable(); + match f.data_type() { + &DataType::Int8 => instantiate_primitive!(v, nullable, Int8Type), + &DataType::Int16 => instantiate_primitive!(v, nullable, Int16Type), + &DataType::Int32 => instantiate_primitive!(v, nullable, Int32Type), + &DataType::Int64 => instantiate_primitive!(v, nullable, Int64Type), + &DataType::UInt8 => instantiate_primitive!(v, nullable, UInt8Type), + &DataType::UInt16 => instantiate_primitive!(v, nullable, UInt16Type), + &DataType::UInt32 => instantiate_primitive!(v, nullable, UInt32Type), + &DataType::UInt64 => instantiate_primitive!(v, nullable, UInt64Type), + &DataType::Float32 => { + instantiate_primitive!(v, nullable, Float32Type) + } + &DataType::Float64 => { + instantiate_primitive!(v, nullable, Float64Type) + } + &DataType::Date32 => instantiate_primitive!(v, nullable, Date32Type), + &DataType::Date64 => instantiate_primitive!(v, nullable, Date64Type), + &DataType::Utf8 => { + let b = ByteGroupValueBuilder::::new(OutputType::Utf8); + v.push(Box::new(b) as _) + } + &DataType::LargeUtf8 => { + let b = 
ByteGroupValueBuilder::::new(OutputType::Utf8); + v.push(Box::new(b) as _) + } + &DataType::Binary => { + let b = ByteGroupValueBuilder::::new(OutputType::Binary); + v.push(Box::new(b) as _) + } + &DataType::LargeBinary => { + let b = ByteGroupValueBuilder::::new(OutputType::Binary); + v.push(Box::new(b) as _) + } + dt => { + return not_impl_err!("{dt} not supported in GroupValuesColumn") + } + } + } + self.group_values = v; + } + + // tracks to which group each of the input rows belongs + groups.clear(); + + // 1.1 Calculate the group keys for the group values + let batch_hashes = &mut self.hashes_buffer; + batch_hashes.clear(); + batch_hashes.resize(n_rows, 0); + create_hashes(cols, &self.random_state, batch_hashes)?; + + for (row, &target_hash) in batch_hashes.iter().enumerate() { + let entry = self.map.get_mut(target_hash, |(exist_hash, group_idx)| { + // Somewhat surprisingly, this closure can be called even if the + // hash doesn't match, so check the hash first with an integer + // comparison first avoid the more expensive comparison with + // group value. 
https://github.com/apache/datafusion/pull/11718 + if target_hash != *exist_hash { + return false; + } + + fn check_row_equal( + array_row: &dyn GroupColumn, + lhs_row: usize, + array: &ArrayRef, + rhs_row: usize, + ) -> bool { + array_row.equal_to(lhs_row, array, rhs_row) + } + + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal(group_val.as_ref(), *group_idx, &cols[i], row) { + return false; + } + } + + true + }); + + let group_idx = match entry { + // Existing group_index for this group value + Some((_hash, group_idx)) => *group_idx, + // 1.2 Need to create new entry for the group + None => { + // Add new entry to aggr_state and save newly created index + // let group_idx = group_values.num_rows(); + // group_values.push(group_rows.row(row)); + + let mut checklen = 0; + let group_idx = self.group_values[0].len(); + for (i, group_value) in self.group_values.iter_mut().enumerate() { + group_value.append_val(&cols[i], row); + let len = group_value.len(); + if i == 0 { + checklen = len; + } else { + debug_assert_eq!(checklen, len); + } + } + + // for hasher function, use precomputed hash value + self.map.insert_accounted( + (target_hash, group_idx), + |(hash, _group_index)| *hash, + &mut self.map_size, + ); + group_idx + } + }; + groups.push(group_idx); + } + + Ok(()) + } + + fn size(&self) -> usize { + let group_values_size: usize = self.group_values.iter().map(|v| v.size()).sum(); + group_values_size + self.map_size + self.hashes_buffer.allocated_size() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn len(&self) -> usize { + if self.group_values.is_empty() { + return 0; + } + + self.group_values[0].len() + } + + fn emit(&mut self, emit_to: EmitTo) -> Result> { + let mut output = match emit_to { + EmitTo::All => { + let group_values = std::mem::take(&mut self.group_values); + debug_assert!(self.group_values.is_empty()); + + group_values + .into_iter() + .map(|v| v.build()) + .collect::>() + } + EmitTo::First(n) => { 
+ let output = self + .group_values + .iter_mut() + .map(|v| v.take_n(n)) + .collect::>(); + + // SAFETY: self.map outlives iterator and is not modified concurrently + unsafe { + for bucket in self.map.iter() { + // Decrement group index by n + match bucket.as_ref().1.checked_sub(n) { + // Group index was >= n, shift value down + Some(sub) => bucket.as_mut().1 = sub, + // Group index was < n, so remove from table + None => self.map.erase(bucket), + } + } + } + + output + } + }; + + // TODO: Materialize dictionaries in group keys (#7647) + for (field, array) in self.schema.fields.iter().zip(&mut output) { + let expected = field.data_type(); + if let DataType::Dictionary(_, v) = expected { + let actual = array.data_type(); + if v.as_ref() != actual { + return Err(DataFusionError::Internal(format!( + "Converted group rows expected dictionary of {v} got {actual}" + ))); + } + *array = cast(array.as_ref(), expected)?; + } + } + + Ok(output) } - #[test] - fn test2() { - let col1 = Arc::new(Int64Array::from(vec![Some(1), Some(0)])) as _; - let col2 = Arc::new(Int64Array::from(vec![Some(0), Some(1)])) as _; - let col3 = Arc::new(Int64Array::from(vec![Some(0), Some(0)])) as _; - let cols = vec![col1, col2, col3]; - let random_state = RandomState::with_seeds(0, 0, 0, 0); - let mut hash_buffer = vec![0; 2]; - create_hashes(&cols, &random_state, &mut hash_buffer); - dbg!(&hash_buffer); + fn clear_shrink(&mut self, batch: &RecordBatch) { + let count = batch.num_rows(); + self.group_values.clear(); + self.map.clear(); + self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared + self.map_size = self.map.capacity() * std::mem::size_of::<(u64, usize)>(); + self.hashes_buffer.clear(); + self.hashes_buffer.shrink_to(count); } } + +/// Returns true if [`GroupValuesColumn`] supported for the specified schema +pub fn supported_schema(schema: &Schema) -> bool { + schema + .fields() + .iter() + .map(|f| f.data_type()) + .all(supported_type) +} + +/// Returns 
true if the specified data type is supported by [`GroupValuesColumn`] +/// +/// In order to be supported, there must be a specialized implementation of +/// [`GroupColumn`] for the data type, instantiated in [`Self::intern`] +fn supported_type(data_type: &DataType) -> bool { + matches!( + *data_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 + | DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Binary + | DataType::LargeBinary + | DataType::Date32 + | DataType::Date64 + | DataType::Utf8View + | DataType::BinaryView + ) +} diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs index fb7b66775092..af1b82de6227 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs @@ -29,7 +29,7 @@ use primitive::GroupValuesPrimitive; mod column; mod row; -use column::GroupValuesColumn; +use column::VectorizedGroupValuesColumn; use row::GroupValuesRows; mod bytes; @@ -37,6 +37,8 @@ mod bytes_view; use bytes::GroupValuesByes; use datafusion_physical_expr::binary_map::OutputType; +use crate::aggregates::{group_values::column::GroupValuesColumn, order::GroupOrdering}; + mod group_column; mod null_builder; @@ -105,7 +107,10 @@ pub trait GroupValues: Send { } /// Return a specialized implementation of [`GroupValues`] for the given schema. 
-pub fn new_group_values(schema: SchemaRef) -> Result> { +pub fn new_group_values( + schema: SchemaRef, + group_ordering: &GroupOrdering, +) -> Result> { if schema.fields.len() == 1 { let d = schema.fields[0].data_type(); @@ -143,8 +148,12 @@ pub fn new_group_values(schema: SchemaRef) -> Result> { } } - if GroupValuesColumn::supported_schema(schema.as_ref()) { - Ok(Box::new(GroupValuesColumn::try_new(schema)?)) + if column::supported_schema(schema.as_ref()) { + if matches!(group_ordering, GroupOrdering::None) { + Ok(Box::new(VectorizedGroupValuesColumn::try_new(schema)?)) + } else { + Ok(Box::new(GroupValuesColumn::try_new(schema)?)) + } } else { Ok(Box::new(GroupValuesRows::try_new(schema)?)) } diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 624844b6b985..ca2f33a5c6d7 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -514,7 +514,7 @@ impl GroupedHashAggregateStream { ordering.as_slice(), )?; - let group_values = new_group_values(group_schema)?; + let group_values = new_group_values(group_schema, &group_ordering)?; timer.done(); let exec_state = ExecutionState::ReadingInput; From 197656ba9228e5c813875c4c35a361d179b87d7b Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 00:10:59 +0800 Subject: [PATCH 31/60] add unit test for `VectorizedGroupValuesColumn`. 
--- .../src/aggregates/group_values/column.rs | 351 ++++++++++++++++++ 1 file changed, 351 insertions(+) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index bf984cdef03d..ce06b0aeae17 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -1020,3 +1020,354 @@ fn supported_type(data_type: &DataType) -> bool { | DataType::BinaryView ) } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use ahash::RandomState; + use arrow::util::pretty::{pretty_format_batches, print_batches, print_columns}; + use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray, StringViewArray}; + use arrow_schema::{DataType, Field, Schema}; + use datafusion_common::hash_utils::create_hashes; + use datafusion_expr::EmitTo; + + use crate::aggregates::group_values::{ + column::{GroupValuesColumn, VectorizedGroupValuesColumn}, + GroupValues, + }; + + #[test] + fn test_vectorized_intern() { + // Situations should be covered + // + // Array type: + // - Primitive array + // - String(byte) array + // - String view(byte view) array + // + // Repeation and nullability in single batch: + // - All not null rows + // - Mixed null + not null rows + // - All null rows + // - All not null rows(repeated) + // - Null + not null rows(repeated) + // - All not null rows(repeated) + // + // If group exists in `map`: + // - Group exists in inlined group view + // - Group exists in non-inlined group view + // - Group not exist + bucket not found in `map` + // - Group not exist + not equal to inlined group view(tested in hash collision) + // - Group not exist + not equal to non-inlined group view(tested in hash collision) + // + + // Intern batch 1 + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map 
case + None, // mixed + repeated rows + not exist in map case + Some(1142), // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some(42), + None, + None, + Some(1142), + None, + // Unique rows in batch + Some(4211), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + Some(4212), // mixed + unique rows + not exist in map case + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + Some("string2"), // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("string1"), + None, + Some("string2"), + None, + None, + // Unique rows in batch + Some("string3"), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + Some("string4"), // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), // all not nulls + repeated rows + exist in map case + Some("stringview2"), // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + // Unique rows in batch + Some("stringview3"), // all not nulls + unique rows + exist in map case + Some("stringview4"), // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + let batch1 = vec![ + Arc::new(col1) as _, + 
Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Intern batch 2 + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + Some(21142), // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some(42), + None, + None, + Some(21142), + None, + // Unique rows in batch + Some(4211), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + Some(24212), // mixed + unique rows + not exist in map case + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + Some("2string2"), // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("string1"), + None, + Some("2string2"), + None, + None, + // Unique rows in batch + Some("string3"), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + Some("2string4"), // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), // all not nulls + repeated rows + exist in map case + Some("stringview2"), // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + // Unique rows in batch + Some("stringview3"), // all not nulls + unique rows + 
exist in map case + Some("stringview4"), // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + let batch2 = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Intern batch 3 + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + Some(31142), // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some(42), + None, + None, + Some(31142), + None, + // Unique rows in batch + Some(4211), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + Some(34212), // mixed + unique rows + not exist in map case + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + Some("3string2"), // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("string1"), + None, + Some("3string2"), + None, + None, + // Unique rows in batch + Some("string3"), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + Some("3string4"), // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), // all not nulls + repeated rows + exist in map case + Some("stringview2"), // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + None, // mixed + 
repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + // Unique rows in batch + Some("stringview3"), // all not nulls + unique rows + exist in map case + Some("stringview4"), // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + let batch3 = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Expected batch + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), + None, + None, + Some(1142), + None, + Some(21142), + None, + Some(31142), + None, + // Unique rows in batch + Some(4211), + None, + None, + Some(4212), + None, + Some(24212), + None, + Some(34212), + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), + None, + Some("string2"), + None, + Some("2string2"), + None, + Some("3string2"), + None, + None, + // Unique rows in batch + Some("string3"), + None, + Some("string4"), + None, + Some("2string4"), + None, + Some("3string4"), + None, + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + None, + None, + None, + None, + // Unique rows in batch + Some("stringview3"), + Some("stringview4"), + None, + None, + None, + None, + None, + None, + ]); + let expected_batch = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Perform vectorized intern + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8View, true), + ])); + let mut group_column_values = + VectorizedGroupValuesColumn::try_new(schema.clone()).unwrap(); + + let mut groups = Vec::new(); + group_column_values.intern(&batch1, &mut groups).unwrap(); + 
group_column_values.intern(&batch2, &mut groups).unwrap(); + group_column_values.intern(&batch3, &mut groups).unwrap(); + + let actual = group_column_values.emit(EmitTo::All).unwrap(); + let actual_batch = RecordBatch::try_new(schema.clone(), actual).unwrap(); + let formatted_actual_batch = + pretty_format_batches(&[actual_batch]).unwrap().to_string(); + let mut formatted_actual_batch_sorted: Vec<&str> = + formatted_actual_batch.trim().lines().collect(); + formatted_actual_batch_sorted.sort_unstable(); + + let expected_batch = RecordBatch::try_new(schema, expected_batch).unwrap(); + let formatted_expected_batch = pretty_format_batches(&[expected_batch]) + .unwrap() + .to_string(); + let mut formatted_expected_batch_sorted: Vec<&str> = + formatted_expected_batch.trim().lines().collect(); + formatted_expected_batch_sorted.sort_unstable(); + + for (i, (actual_line, expected_line)) in formatted_actual_batch_sorted + .iter() + .zip(&formatted_expected_batch_sorted) + .enumerate() + { + assert_eq!( + (i, actual_line), + (i, expected_line), + "Inconsistent result\n\n\ + Actual batch:\n{}\n\ + Expected batch:\n{}\n\ + ", + formatted_actual_batch, + formatted_expected_batch, + ); + } + } +} From cc96beb498870c37e87a559c26e3c4a4f1916959 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 12:36:07 +0800 Subject: [PATCH 32/60] add unit test for emitting first n in `VectorizedGroupValuesColumn`. 
--- .../src/aggregates/group_values/column.rs | 688 ++++++++++-------- 1 file changed, 380 insertions(+), 308 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index ce06b0aeae17..dfac7bc3b7e4 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -1023,12 +1023,15 @@ fn supported_type(data_type: &DataType) -> bool { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{cmp, sync::Arc}; use ahash::RandomState; - use arrow::util::pretty::{pretty_format_batches, print_batches, print_columns}; + use arrow::{ + compute::concat_batches, + util::pretty::{pretty_format_batches, print_batches, print_columns}, + }; use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray, StringViewArray}; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{DataType, Field, Schema, SchemaRef}; use datafusion_common::hash_utils::create_hashes; use datafusion_expr::EmitTo; @@ -1038,315 +1041,384 @@ mod tests { }; #[test] - fn test_vectorized_intern() { - // Situations should be covered - // - // Array type: - // - Primitive array - // - String(byte) array - // - String view(byte view) array - // - // Repeation and nullability in single batch: - // - All not null rows - // - Mixed null + not null rows - // - All null rows - // - All not null rows(repeated) - // - Null + not null rows(repeated) - // - All not null rows(repeated) - // - // If group exists in `map`: - // - Group exists in inlined group view - // - Group exists in non-inlined group view - // - Group not exist + bucket not found in `map` - // - Group not exist + not equal to inlined group view(tested in hash collision) - // - Group not exist + not equal to non-inlined group view(tested in hash collision) - // - - // Intern batch 1 - let col1 = Int64Array::from(vec![ - // Repeated rows in batch - Some(42), // 
all not nulls + repeated rows + exist in map case - None, // mixed + repeated rows + exist in map case - None, // mixed + repeated rows + not exist in map case - Some(1142), // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some(42), - None, - None, - Some(1142), - None, - // Unique rows in batch - Some(4211), // all not nulls + unique rows + exist in map case - None, // mixed + unique rows + exist in map case - None, // mixed + unique rows + not exist in map case - Some(4212), // mixed + unique rows + not exist in map case - ]); - - let col2 = StringArray::from(vec![ - // Repeated rows in batch - Some("string1"), // all not nulls + repeated rows + exist in map case - None, // mixed + repeated rows + exist in map case - Some("string2"), // mixed + repeated rows + not exist in map case - None, // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some("string1"), - None, - Some("string2"), - None, - None, - // Unique rows in batch - Some("string3"), // all not nulls + unique rows + exist in map case - None, // mixed + unique rows + exist in map case - Some("string4"), // mixed + unique rows + not exist in map case - None, // mixed + unique rows + not exist in map case - ]); - - let col3 = StringViewArray::from(vec![ - // Repeated rows in batch - Some("stringview1"), // all not nulls + repeated rows + exist in map case - Some("stringview2"), // mixed + repeated rows + exist in map case - None, // mixed + repeated rows + not exist in map case - None, // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some("stringview1"), - Some("stringview2"), - None, - None, - None, - // Unique rows in batch - Some("stringview3"), // all not nulls + unique rows + exist in map case - Some("stringview4"), // mixed + unique rows + exist in map case - None, // mixed + unique rows + not exist in map case - None, // mixed + 
unique rows + not exist in map case - ]); - let batch1 = vec![ - Arc::new(col1) as _, - Arc::new(col2) as _, - Arc::new(col3) as _, - ]; - - // Intern batch 2 - let col1 = Int64Array::from(vec![ - // Repeated rows in batch - Some(42), // all not nulls + repeated rows + exist in map case - None, // mixed + repeated rows + exist in map case - None, // mixed + repeated rows + not exist in map case - Some(21142), // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some(42), - None, - None, - Some(21142), - None, - // Unique rows in batch - Some(4211), // all not nulls + unique rows + exist in map case - None, // mixed + unique rows + exist in map case - None, // mixed + unique rows + not exist in map case - Some(24212), // mixed + unique rows + not exist in map case - ]); - - let col2 = StringArray::from(vec![ - // Repeated rows in batch - Some("string1"), // all not nulls + repeated rows + exist in map case - None, // mixed + repeated rows + exist in map case - Some("2string2"), // mixed + repeated rows + not exist in map case - None, // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some("string1"), - None, - Some("2string2"), - None, - None, - // Unique rows in batch - Some("string3"), // all not nulls + unique rows + exist in map case - None, // mixed + unique rows + exist in map case - Some("2string4"), // mixed + unique rows + not exist in map case - None, // mixed + unique rows + not exist in map case - ]); - - let col3 = StringViewArray::from(vec![ - // Repeated rows in batch - Some("stringview1"), // all not nulls + repeated rows + exist in map case - Some("stringview2"), // mixed + repeated rows + exist in map case - None, // mixed + repeated rows + not exist in map case - None, // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some("stringview1"), - Some("stringview2"), - None, - None, - 
None, - // Unique rows in batch - Some("stringview3"), // all not nulls + unique rows + exist in map case - Some("stringview4"), // mixed + unique rows + exist in map case - None, // mixed + unique rows + not exist in map case - None, // mixed + unique rows + not exist in map case - ]); - let batch2 = vec![ - Arc::new(col1) as _, - Arc::new(col2) as _, - Arc::new(col3) as _, - ]; - - // Intern batch 3 - let col1 = Int64Array::from(vec![ - // Repeated rows in batch - Some(42), // all not nulls + repeated rows + exist in map case - None, // mixed + repeated rows + exist in map case - None, // mixed + repeated rows + not exist in map case - Some(31142), // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some(42), - None, - None, - Some(31142), - None, - // Unique rows in batch - Some(4211), // all not nulls + unique rows + exist in map case - None, // mixed + unique rows + exist in map case - None, // mixed + unique rows + not exist in map case - Some(34212), // mixed + unique rows + not exist in map case - ]); - - let col2 = StringArray::from(vec![ - // Repeated rows in batch - Some("string1"), // all not nulls + repeated rows + exist in map case - None, // mixed + repeated rows + exist in map case - Some("3string2"), // mixed + repeated rows + not exist in map case - None, // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some("string1"), - None, - Some("3string2"), - None, - None, - // Unique rows in batch - Some("string3"), // all not nulls + unique rows + exist in map case - None, // mixed + unique rows + exist in map case - Some("3string4"), // mixed + unique rows + not exist in map case - None, // mixed + unique rows + not exist in map case - ]); - - let col3 = StringViewArray::from(vec![ - // Repeated rows in batch - Some("stringview1"), // all not nulls + repeated rows + exist in map case - Some("stringview2"), // mixed + repeated rows + exist in 
map case - None, // mixed + repeated rows + not exist in map case - None, // mixed + repeated rows + not exist in map case - None, // all nulls + repeated rows + exist in map case - Some("stringview1"), - Some("stringview2"), - None, - None, - None, - // Unique rows in batch - Some("stringview3"), // all not nulls + unique rows + exist in map case - Some("stringview4"), // mixed + unique rows + exist in map case - None, // mixed + unique rows + not exist in map case - None, // mixed + unique rows + not exist in map case - ]); - let batch3 = vec![ - Arc::new(col1) as _, - Arc::new(col2) as _, - Arc::new(col3) as _, - ]; - - // Expected batch - let col1 = Int64Array::from(vec![ - // Repeated rows in batch - Some(42), - None, - None, - Some(1142), - None, - Some(21142), - None, - Some(31142), - None, - // Unique rows in batch - Some(4211), - None, - None, - Some(4212), - None, - Some(24212), - None, - Some(34212), - ]); - - let col2 = StringArray::from(vec![ - // Repeated rows in batch - Some("string1"), - None, - Some("string2"), - None, - Some("2string2"), - None, - Some("3string2"), - None, - None, - // Unique rows in batch - Some("string3"), - None, - Some("string4"), - None, - Some("2string4"), - None, - Some("3string4"), - None, - ]); - - let col3 = StringViewArray::from(vec![ - // Repeated rows in batch - Some("stringview1"), - Some("stringview2"), - None, - None, - None, - None, - None, - None, - None, - // Unique rows in batch - Some("stringview3"), - Some("stringview4"), - None, - None, - None, - None, - None, - None, - ]); - let expected_batch = vec![ - Arc::new(col1) as _, - Arc::new(col2) as _, - Arc::new(col3) as _, - ]; - - // Perform vectorized intern - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Utf8View, true), - ])); - let mut group_column_values = - VectorizedGroupValuesColumn::try_new(schema.clone()).unwrap(); - - let mut groups = Vec::new(); 
- group_column_values.intern(&batch1, &mut groups).unwrap(); - group_column_values.intern(&batch2, &mut groups).unwrap(); - group_column_values.intern(&batch3, &mut groups).unwrap(); - - let actual = group_column_values.emit(EmitTo::All).unwrap(); - let actual_batch = RecordBatch::try_new(schema.clone(), actual).unwrap(); - let formatted_actual_batch = - pretty_format_batches(&[actual_batch]).unwrap().to_string(); + fn test_intern_for_vectorized_group_values() { + let data_set = VectorizedTestDataSet::new(); + let mut group_values = + VectorizedGroupValuesColumn::try_new(data_set.schema()).unwrap(); + + data_set.load_to_group_values(&mut group_values); + let actual_batch = group_values.emit(EmitTo::All).unwrap(); + let actual_batch = RecordBatch::try_new(data_set.schema(), actual_batch).unwrap(); + + check_result(&actual_batch, &data_set.expected_batch); + } + + #[test] + fn test_emit_first_n_for_vectorized_group_values() { + let data_set = VectorizedTestDataSet::new(); + let mut group_values = + VectorizedGroupValuesColumn::try_new(data_set.schema()).unwrap(); + + // 1~num_rows times to emit the groups + let num_rows = data_set.expected_batch.num_rows(); + let schema = data_set.schema(); + for times_to_take in 1..=num_rows { + // Write data after emitting + data_set.load_to_group_values(&mut group_values); + + // Emit `times_to_take` times, collect and concat the sub-results to total result, + // then check it + let suggest_num_emit = data_set.expected_batch.num_rows() / times_to_take; + let mut num_remaining_rows = num_rows; + let mut actual_sub_batches = Vec::new(); + + for nth_time in 0..times_to_take { + let num_emit = if nth_time == times_to_take - 1 { + num_remaining_rows + } else { + suggest_num_emit + }; + + let sub_batch = group_values.emit(EmitTo::First(num_emit)).unwrap(); + let sub_batch = RecordBatch::try_new(schema.clone(), sub_batch).unwrap(); + actual_sub_batches.push(sub_batch); + + num_remaining_rows -= num_emit; + } + assert!(num_remaining_rows 
== 0); + + let actual_batch = concat_batches(&schema, &actual_sub_batches).unwrap(); + check_result(&actual_batch, &data_set.expected_batch); + } + } + + /// Test data set for [`VectorizedGroupValuesColumn`] + /// + /// Define the test data and support loading them into test [`VectorizedGroupValuesColumn`] + /// + /// The covering situations: + /// + /// Array type: + /// - Primitive array + /// - String(byte) array + /// - String view(byte view) array + /// + /// Repeation and nullability in single batch: + /// - All not null rows + /// - Mixed null + not null rows + /// - All null rows + /// - All not null rows(repeated) + /// - Null + not null rows(repeated) + /// - All not null rows(repeated) + /// + /// If group exists in `map`: + /// - Group exists in inlined group view + /// - Group exists in non-inlined group view + /// - Group not exist + bucket not found in `map` + /// - Group not exist + not equal to inlined group view(tested in hash collision) + /// - Group not exist + not equal to non-inlined group view(tested in hash collision) + /// + struct VectorizedTestDataSet { + test_batches: Vec>, + expected_batch: RecordBatch, + } + + impl VectorizedTestDataSet { + fn new() -> Self { + // Intern batch 1 + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + Some(1142), // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some(42), + None, + None, + Some(1142), + None, + // Unique rows in batch + Some(4211), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + Some(4212), // mixed + unique rows + not exist in map case + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), // all not 
nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + Some("string2"), // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("string1"), + None, + Some("string2"), + None, + None, + // Unique rows in batch + Some("string3"), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + Some("string4"), // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), // all not nulls + repeated rows + exist in map case + Some("stringview2"), // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + // Unique rows in batch + Some("stringview3"), // all not nulls + unique rows + exist in map case + Some("stringview4"), // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + let batch1 = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Intern batch 2 + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + Some(21142), // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some(42), + None, + None, + Some(21142), + None, + // Unique rows in batch + Some(4211), // all not nulls + unique rows + exist in map case + None, // mixed + unique 
rows + exist in map case + None, // mixed + unique rows + not exist in map case + Some(24212), // mixed + unique rows + not exist in map case + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + Some("2string2"), // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("string1"), + None, + Some("2string2"), + None, + None, + // Unique rows in batch + Some("string3"), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + Some("2string4"), // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), // all not nulls + repeated rows + exist in map case + Some("stringview2"), // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + // Unique rows in batch + Some("stringview3"), // all not nulls + unique rows + exist in map case + Some("stringview4"), // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + let batch2 = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Intern batch 3 + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + Some(31142), // mixed + repeated rows + 
not exist in map case + None, // all nulls + repeated rows + exist in map case + Some(42), + None, + None, + Some(31142), + None, + // Unique rows in batch + Some(4211), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + Some(34212), // mixed + unique rows + not exist in map case + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), // all not nulls + repeated rows + exist in map case + None, // mixed + repeated rows + exist in map case + Some("3string2"), // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("string1"), + None, + Some("3string2"), + None, + None, + // Unique rows in batch + Some("string3"), // all not nulls + unique rows + exist in map case + None, // mixed + unique rows + exist in map case + Some("3string4"), // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), // all not nulls + repeated rows + exist in map case + Some("stringview2"), // mixed + repeated rows + exist in map case + None, // mixed + repeated rows + not exist in map case + None, // mixed + repeated rows + not exist in map case + None, // all nulls + repeated rows + exist in map case + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + // Unique rows in batch + Some("stringview3"), // all not nulls + unique rows + exist in map case + Some("stringview4"), // mixed + unique rows + exist in map case + None, // mixed + unique rows + not exist in map case + None, // mixed + unique rows + not exist in map case + ]); + let batch3 = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + + // Expected batch + let schema = 
Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8View, true), + ])); + + let col1 = Int64Array::from(vec![ + // Repeated rows in batch + Some(42), + None, + None, + Some(1142), + None, + Some(21142), + None, + Some(31142), + None, + // Unique rows in batch + Some(4211), + None, + None, + Some(4212), + None, + Some(24212), + None, + Some(34212), + ]); + + let col2 = StringArray::from(vec![ + // Repeated rows in batch + Some("string1"), + None, + Some("string2"), + None, + Some("2string2"), + None, + Some("3string2"), + None, + None, + // Unique rows in batch + Some("string3"), + None, + Some("string4"), + None, + Some("2string4"), + None, + Some("3string4"), + None, + ]); + + let col3 = StringViewArray::from(vec![ + // Repeated rows in batch + Some("stringview1"), + Some("stringview2"), + None, + None, + None, + None, + None, + None, + None, + // Unique rows in batch + Some("stringview3"), + Some("stringview4"), + None, + None, + None, + None, + None, + None, + ]); + let expected_batch = vec![ + Arc::new(col1) as _, + Arc::new(col2) as _, + Arc::new(col3) as _, + ]; + let expected_batch = RecordBatch::try_new(schema, expected_batch).unwrap(); + + Self { + test_batches: vec![batch1, batch2, batch3], + expected_batch, + } + } + + fn load_to_group_values(&self, group_values: &mut impl GroupValues) { + for batch in self.test_batches.iter() { + group_values.intern(&batch, &mut vec![]).unwrap(); + } + } + + fn schema(&self) -> SchemaRef { + self.expected_batch.schema() + } + } + + fn check_result(actual_batch: &RecordBatch, expected_batch: &RecordBatch) { + let formatted_actual_batch = pretty_format_batches(&[actual_batch.clone()]) + .unwrap() + .to_string(); let mut formatted_actual_batch_sorted: Vec<&str> = formatted_actual_batch.trim().lines().collect(); formatted_actual_batch_sorted.sort_unstable(); - let expected_batch = RecordBatch::try_new(schema, expected_batch).unwrap(); 
- let formatted_expected_batch = pretty_format_batches(&[expected_batch]) + let formatted_expected_batch = pretty_format_batches(&[expected_batch.clone()]) .unwrap() .to_string(); let mut formatted_expected_batch_sorted: Vec<&str> = From 2c1ec195523a8bf26d9b0026919ebf0dda5d77ae Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 13:16:47 +0800 Subject: [PATCH 33/60] sort out tests codes in for group columns and add vectorized tests for primitives. --- .../aggregates/group_values/group_column.rs | 247 ++++++++++++++---- 1 file changed, 191 insertions(+), 56 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index c40e6eb3fa97..bead4cad1c29 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -1162,53 +1162,68 @@ mod tests { use super::{ByteGroupValueBuilder, GroupColumn}; + // ======================================================================== + // Tests for primitive builders + // ======================================================================== #[test] - fn test_take_n() { - let mut builder = ByteGroupValueBuilder::::new(OutputType::Utf8); - let array = Arc::new(StringArray::from(vec![Some("a"), None])) as ArrayRef; - // a, null, null - builder.append_val(&array, 0); - builder.append_val(&array, 1); - builder.append_val(&array, 1); - - // (a, null) remaining: null - let output = builder.take_n(2); - assert_eq!(&output, &array); + fn test_nullable_primitive_equal_to() { + let append = |builder: &mut PrimitiveGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + for &index in append_rows { + builder.append_val(builder_array, index); + } + }; - // null, a, null, a - builder.append_val(&array, 0); - builder.append_val(&array, 1); - builder.append_val(&array, 0); + let equal_to = |builder: 
&PrimitiveGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + let iter = lhs_rows.iter().zip(rhs_rows.iter()); + for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { + equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + } + }; - // (null, a) remaining: (null, a) - let output = builder.take_n(2); - let array = Arc::new(StringArray::from(vec![None, Some("a")])) as ArrayRef; - assert_eq!(&output, &array); + test_nullable_primitive_equal_to_internal(append, equal_to); + } - let array = Arc::new(StringArray::from(vec![ - Some("a"), - None, - Some("longstringfortest"), - ])) as ArrayRef; + #[test] + fn test_nullable_primitive_vectorized_equal_to() { + let append = |builder: &mut PrimitiveGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + builder.vectorized_append(builder_array, append_rows); + }; - // null, a, longstringfortest, null, null - builder.append_val(&array, 2); - builder.append_val(&array, 1); - builder.append_val(&array, 1); + let equal_to = |builder: &PrimitiveGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + builder.vectorized_equal_to( + lhs_rows, + input_array, + rhs_rows, + equal_to_results, + ); + }; - // (null, a, longstringfortest, null) remaining: (null) - let output = builder.take_n(4); - let array = Arc::new(StringArray::from(vec![ - None, - Some("a"), - Some("longstringfortest"), - None, - ])) as ArrayRef; - assert_eq!(&output, &array); + test_nullable_primitive_equal_to_internal(append, equal_to); } - #[test] - fn test_nullable_primitive_equal_to() { + fn test_nullable_primitive_equal_to_internal(mut append: A, mut equal_to: E) + where + A: FnMut(&mut PrimitiveGroupValueBuilder, &ArrayRef, &[usize]), + E: FnMut( + &PrimitiveGroupValueBuilder, + &[usize], + &ArrayRef, + &[usize], + &mut Vec, + ), + { // Will cover such cases: // - exist null, input not null 
// - exist null, input null; values not equal @@ -1227,12 +1242,7 @@ mod tests { Some(2), Some(3), ])) as ArrayRef; - builder.append_val(&builder_array, 0); - builder.append_val(&builder_array, 1); - builder.append_val(&builder_array, 2); - builder.append_val(&builder_array, 3); - builder.append_val(&builder_array, 4); - builder.append_val(&builder_array, 5); + append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5]); // Define input array let (_nulls, values, _) = @@ -1251,16 +1261,82 @@ mod tests { let input_array = Arc::new(Int64Array::new(values, Some(nulls))) as ArrayRef; // Check - assert!(!builder.equal_to(0, &input_array, 0)); - assert!(builder.equal_to(1, &input_array, 1)); - assert!(builder.equal_to(2, &input_array, 2)); - assert!(!builder.equal_to(3, &input_array, 3)); - assert!(!builder.equal_to(4, &input_array, 4)); - assert!(builder.equal_to(5, &input_array, 5)); + let mut equal_to_results = vec![true; builder.len()]; + equal_to( + &builder, + &[0, 1, 2, 3, 4, 5], + &input_array, + &[0, 1, 2, 3, 4, 5], + &mut equal_to_results, + ); + + assert!(!equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(!equal_to_results[3]); + assert!(!equal_to_results[4]); + assert!(equal_to_results[5]); } #[test] fn test_not_nullable_primitive_equal_to() { + let append = |builder: &mut PrimitiveGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + for &index in append_rows { + builder.append_val(builder_array, index); + } + }; + + let equal_to = |builder: &PrimitiveGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + let iter = lhs_rows.iter().zip(rhs_rows.iter()); + for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { + equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + } + }; + + test_not_nullable_primitive_equal_to_internal(append, equal_to); + } + + #[test] + fn 
test_not_nullable_primitive_vectorized_equal_to() { + let append = |builder: &mut PrimitiveGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + builder.vectorized_append(builder_array, append_rows); + }; + + let equal_to = |builder: &PrimitiveGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + builder.vectorized_equal_to( + lhs_rows, + input_array, + rhs_rows, + equal_to_results, + ); + }; + + test_not_nullable_primitive_equal_to_internal(append, equal_to); + } + + fn test_not_nullable_primitive_equal_to_internal(mut append: A, mut equal_to: E) + where + A: FnMut(&mut PrimitiveGroupValueBuilder, &ArrayRef, &[usize]), + E: FnMut( + &PrimitiveGroupValueBuilder, + &[usize], + &ArrayRef, + &[usize], + &mut Vec, + ), + { // Will cover such cases: // - values equal // - values not equal @@ -1269,19 +1345,75 @@ mod tests { let mut builder = PrimitiveGroupValueBuilder::::new(); let builder_array = Arc::new(Int64Array::from(vec![Some(0), Some(1)])) as ArrayRef; - builder.append_val(&builder_array, 0); - builder.append_val(&builder_array, 1); + append(&mut builder, &builder_array, &[0, 1]); // Define input array let input_array = Arc::new(Int64Array::from(vec![Some(0), Some(2)])) as ArrayRef; // Check - assert!(builder.equal_to(0, &input_array, 0)); - assert!(!builder.equal_to(1, &input_array, 1)); + let mut equal_to_results = vec![true; builder.len()]; + equal_to( + &builder, + &[0, 1], + &input_array, + &[0, 1], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(!equal_to_results[1]); + } + + // ======================================================================== + // Tests for byte builders + // ======================================================================== + #[test] + fn test_byte_take_n() { + let mut builder = ByteGroupValueBuilder::::new(OutputType::Utf8); + let array = Arc::new(StringArray::from(vec![Some("a"), None])) as ArrayRef; + // a, 
null, null + builder.append_val(&array, 0); + builder.append_val(&array, 1); + builder.append_val(&array, 1); + + // (a, null) remaining: null + let output = builder.take_n(2); + assert_eq!(&output, &array); + + // null, a, null, a + builder.append_val(&array, 0); + builder.append_val(&array, 1); + builder.append_val(&array, 0); + + // (null, a) remaining: (null, a) + let output = builder.take_n(2); + let array = Arc::new(StringArray::from(vec![None, Some("a")])) as ArrayRef; + assert_eq!(&output, &array); + + let array = Arc::new(StringArray::from(vec![ + Some("a"), + None, + Some("longstringfortest"), + ])) as ArrayRef; + + // null, a, longstringfortest, null, null + builder.append_val(&array, 2); + builder.append_val(&array, 1); + builder.append_val(&array, 1); + + // (null, a, longstringfortest, null) remaining: (null) + let output = builder.take_n(4); + let array = Arc::new(StringArray::from(vec![ + None, + Some("a"), + Some("longstringfortest"), + None, + ])) as ArrayRef; + assert_eq!(&output, &array); } #[test] - fn test_byte_array_equal_to() { + fn test_byte_equal_to() { // Will cover such cases: // - exist null, input not null // - exist null, input null; values not equal @@ -1339,6 +1471,9 @@ mod tests { assert!(builder.equal_to(5, &input_array, 5)); } + // ======================================================================== + // Tests for byte view builders + // ======================================================================== #[test] fn test_byte_view_append_val() { let mut builder = From fa6343c2deb7a9de1a897f9b3850bc9f8efdbc41 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 13:33:35 +0800 Subject: [PATCH 34/60] add vectorized test for byte builder. 
--- .../aggregates/group_values/group_column.rs | 85 ++++++++++++++++--- 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index bead4cad1c29..ea152b722804 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -1414,6 +1414,63 @@ mod tests { #[test] fn test_byte_equal_to() { + let append = |builder: &mut ByteGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + for &index in append_rows { + builder.append_val(builder_array, index); + } + }; + + let equal_to = |builder: &ByteGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + let iter = lhs_rows.iter().zip(rhs_rows.iter()); + for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { + equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + } + }; + + test_byte_equal_to_internal(append, equal_to); + } + + #[test] + fn test_byte_vectorized_equal_to() { + let append = |builder: &mut ByteGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + builder.vectorized_append(builder_array, append_rows); + }; + + let equal_to = |builder: &ByteGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + builder.vectorized_equal_to( + lhs_rows, + input_array, + rhs_rows, + equal_to_results, + ); + }; + + test_byte_equal_to_internal(append, equal_to); + } + + fn test_byte_equal_to_internal(mut append: A, mut equal_to: E) + where + A: FnMut(&mut ByteGroupValueBuilder, &ArrayRef, &[usize]), + E: FnMut( + &ByteGroupValueBuilder, + &[usize], + &ArrayRef, + &[usize], + &mut Vec, + ), + { // Will cover such cases: // - exist null, input not null // - exist null, input null; values not equal @@ 
-1432,12 +1489,7 @@ mod tests { Some("bar"), Some("baz"), ])) as ArrayRef; - builder.append_val(&builder_array, 0); - builder.append_val(&builder_array, 1); - builder.append_val(&builder_array, 2); - builder.append_val(&builder_array, 3); - builder.append_val(&builder_array, 4); - builder.append_val(&builder_array, 5); + append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5]); // Define input array let (offsets, buffer, _nulls) = StringArray::from(vec![ @@ -1463,12 +1515,21 @@ mod tests { Arc::new(StringArray::new(offsets, buffer, Some(nulls))) as ArrayRef; // Check - assert!(!builder.equal_to(0, &input_array, 0)); - assert!(builder.equal_to(1, &input_array, 1)); - assert!(builder.equal_to(2, &input_array, 2)); - assert!(!builder.equal_to(3, &input_array, 3)); - assert!(!builder.equal_to(4, &input_array, 4)); - assert!(builder.equal_to(5, &input_array, 5)); + let mut equal_to_results = vec![true; builder.len()]; + equal_to( + &builder, + &[0, 1, 2, 3, 4, 5], + &input_array, + &[0, 1, 2, 3, 4, 5], + &mut equal_to_results, + ); + + assert!(!equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(!equal_to_results[3]); + assert!(!equal_to_results[4]); + assert!(equal_to_results[5]); } // ======================================================================== From 41ac655b7188f5f58009a48e3ac71016f395d128 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 13:43:52 +0800 Subject: [PATCH 35/60] add vectorized test for byte view builder. 
--- .../aggregates/group_values/group_column.rs | 102 ++++++++++++++---- 1 file changed, 80 insertions(+), 22 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index ea152b722804..92d5324281b4 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -1555,12 +1555,69 @@ mod tests { let output = Box::new(builder).build(); // should be 2 output buffers to hold all the data - assert_eq!(output.as_string_view().data_buffers().len(), 2,); + assert_eq!(output.as_string_view().data_buffers().len(), 2); assert_eq!(&output, &builder_array) } #[test] fn test_byte_view_equal_to() { + let append = |builder: &mut ByteViewGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + for &index in append_rows { + builder.append_val(builder_array, index); + } + }; + + let equal_to = |builder: &ByteViewGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + let iter = lhs_rows.iter().zip(rhs_rows.iter()); + for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { + equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + } + }; + + test_byte_view_equal_to_internal(append, equal_to); + } + + #[test] + fn test_byte_view_vectorized_equal_to() { + let append = |builder: &mut ByteViewGroupValueBuilder, + builder_array: &ArrayRef, + append_rows: &[usize]| { + builder.vectorized_append(builder_array, append_rows); + }; + + let equal_to = |builder: &ByteViewGroupValueBuilder, + lhs_rows: &[usize], + input_array: &ArrayRef, + rhs_rows: &[usize], + equal_to_results: &mut Vec| { + builder.vectorized_equal_to( + lhs_rows, + input_array, + rhs_rows, + equal_to_results, + ); + }; + + test_byte_view_equal_to_internal(append, equal_to); + } + + fn test_byte_view_equal_to_internal(mut 
append: A, mut equal_to: E) + where + A: FnMut(&mut ByteViewGroupValueBuilder, &ArrayRef, &[usize]), + E: FnMut( + &ByteViewGroupValueBuilder, + &[usize], + &ArrayRef, + &[usize], + &mut Vec, + ), + { // Will cover such cases: // - exist null, input not null // - exist null, input null; values not equal @@ -1600,15 +1657,7 @@ mod tests { Some("I am a long string for test eq in completed"), Some("I am a long string for test eq in progress"), ])) as ArrayRef; - builder.append_val(&builder_array, 0); - builder.append_val(&builder_array, 1); - builder.append_val(&builder_array, 2); - builder.append_val(&builder_array, 3); - builder.append_val(&builder_array, 4); - builder.append_val(&builder_array, 5); - builder.append_val(&builder_array, 6); - builder.append_val(&builder_array, 7); - builder.append_val(&builder_array, 8); + append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5, 6, 7, 8]); // Define input array let (views, buffer, _nulls) = StringViewArray::from(vec![ @@ -1646,18 +1695,27 @@ mod tests { Arc::new(StringViewArray::new(views, buffer, Some(nulls))) as ArrayRef; // Check - assert!(!builder.equal_to(0, &input_array, 0)); - assert!(builder.equal_to(1, &input_array, 1)); - assert!(builder.equal_to(2, &input_array, 2)); - assert!(!builder.equal_to(3, &input_array, 3)); - assert!(!builder.equal_to(4, &input_array, 4)); - assert!(!builder.equal_to(5, &input_array, 5)); - assert!(builder.equal_to(6, &input_array, 6)); - assert!(!builder.equal_to(7, &input_array, 7)); - assert!(!builder.equal_to(7, &input_array, 8)); - assert!(builder.equal_to(7, &input_array, 9)); - assert!(!builder.equal_to(8, &input_array, 10)); - assert!(builder.equal_to(8, &input_array, 11)); + let mut equal_to_results = vec![true; input_array.len()]; + equal_to( + &builder, + &[0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 8], + &input_array, + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], + &mut equal_to_results, + ); + + assert!(!equal_to_results[0]); + assert!(equal_to_results[1]); + 
assert!(equal_to_results[2]); + assert!(!equal_to_results[3]); + assert!(!equal_to_results[4]); + assert!(!equal_to_results[5]); + assert!(equal_to_results[6]); + assert!(!equal_to_results[7]); + assert!(!equal_to_results[8]); + assert!(equal_to_results[9]); + assert!(!equal_to_results[10]); + assert!(equal_to_results[11]); } #[test] From 4f8924e90258b8b92cc873b95eab3099fa89476f Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 19:55:01 +0800 Subject: [PATCH 36/60] add test for the all nulls or not nulls branches in vectorized. --- .../aggregates/group_values/group_column.rs | 171 +++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 92d5324281b4..db5165ead58e 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -1152,7 +1152,7 @@ mod tests { array::AsArray, datatypes::{Int64Type, StringViewType}, }; - use arrow_array::{ArrayRef, Int64Array, StringArray, StringViewArray}; + use arrow_array::{Array, ArrayRef, Int64Array, StringArray, StringViewArray}; use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; use datafusion_physical_expr::binary_map::OutputType; @@ -1364,6 +1364,62 @@ mod tests { assert!(!equal_to_results[1]); } + #[test] + fn test_nullable_primitive_vectorized_operation_special_case() { + // Test the special `all nulls` or `not nulls` input array case + // for vectorized append and equal to + + let mut builder = PrimitiveGroupValueBuilder::::new(); + + // All nulls input array + let all_nulls_input_array = Arc::new(Int64Array::from(vec![ + Option::::None, + None, + None, + None, + None, + ])) as _; + builder.vectorized_append(&all_nulls_input_array, &[0, 1, 2, 3, 4]); + + let mut equal_to_results = vec![true; all_nulls_input_array.len()]; + 
builder.vectorized_equal_to( + &[0, 1, 2, 3, 4], + &all_nulls_input_array, + &[0, 1, 2, 3, 4], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(equal_to_results[3]); + assert!(equal_to_results[4]); + + // All not nulls input array + let all_not_nulls_input_array = Arc::new(Int64Array::from(vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + ])) as _; + builder.vectorized_append(&all_not_nulls_input_array, &[0, 1, 2, 3, 4]); + + let mut equal_to_results = vec![true; all_not_nulls_input_array.len()]; + builder.vectorized_equal_to( + &[5, 6, 7, 8, 9], + &all_not_nulls_input_array, + &[0, 1, 2, 3, 4], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(equal_to_results[3]); + assert!(equal_to_results[4]); + } + // ======================================================================== // Tests for byte builders // ======================================================================== @@ -1460,6 +1516,62 @@ mod tests { test_byte_equal_to_internal(append, equal_to); } + #[test] + fn test_byte_vectorized_operation_special_case() { + // Test the special `all nulls` or `not nulls` input array case + // for vectorized append and equal to + + let mut builder = ByteGroupValueBuilder::::new(OutputType::Utf8); + + // All nulls input array + let all_nulls_input_array = Arc::new(StringArray::from(vec![ + Option::<&str>::None, + None, + None, + None, + None, + ])) as _; + builder.vectorized_append(&all_nulls_input_array, &[0, 1, 2, 3, 4]); + + let mut equal_to_results = vec![true; all_nulls_input_array.len()]; + builder.vectorized_equal_to( + &[0, 1, 2, 3, 4], + &all_nulls_input_array, + &[0, 1, 2, 3, 4], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(equal_to_results[3]); + assert!(equal_to_results[4]); + 
+ // All not nulls input array + let all_not_nulls_input_array = Arc::new(StringArray::from(vec![ + Some("string1"), + Some("string2"), + Some("string3"), + Some("string4"), + Some("string5"), + ])) as _; + builder.vectorized_append(&all_not_nulls_input_array, &[0, 1, 2, 3, 4]); + + let mut equal_to_results = vec![true; all_not_nulls_input_array.len()]; + builder.vectorized_equal_to( + &[5, 6, 7, 8, 9], + &all_not_nulls_input_array, + &[0, 1, 2, 3, 4], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(equal_to_results[3]); + assert!(equal_to_results[4]); + } + fn test_byte_equal_to_internal(mut append: A, mut equal_to: E) where A: FnMut(&mut ByteGroupValueBuilder, &ArrayRef, &[usize]), @@ -1607,6 +1719,63 @@ mod tests { test_byte_view_equal_to_internal(append, equal_to); } + #[test] + fn test_byte_view_vectorized_operation_special_case() { + // Test the special `all nulls` or `not nulls` input array case + // for vectorized append and equal to + + let mut builder = + ByteViewGroupValueBuilder::::new().with_max_block_size(60); + + // All nulls input array + let all_nulls_input_array = Arc::new(StringViewArray::from(vec![ + Option::<&str>::None, + None, + None, + None, + None, + ])) as _; + builder.vectorized_append(&all_nulls_input_array, &[0, 1, 2, 3, 4]); + + let mut equal_to_results = vec![true; all_nulls_input_array.len()]; + builder.vectorized_equal_to( + &[0, 1, 2, 3, 4], + &all_nulls_input_array, + &[0, 1, 2, 3, 4], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(equal_to_results[3]); + assert!(equal_to_results[4]); + + // All not nulls input array + let all_not_nulls_input_array = Arc::new(StringViewArray::from(vec![ + Some("stringview1"), + Some("stringview2"), + Some("stringview3"), + Some("stringview4"), + Some("stringview5"), + ])) as _; + 
builder.vectorized_append(&all_not_nulls_input_array, &[0, 1, 2, 3, 4]); + + let mut equal_to_results = vec![true; all_not_nulls_input_array.len()]; + builder.vectorized_equal_to( + &[5, 6, 7, 8, 9], + &all_not_nulls_input_array, + &[0, 1, 2, 3, 4], + &mut equal_to_results, + ); + + assert!(equal_to_results[0]); + assert!(equal_to_results[1]); + assert!(equal_to_results[2]); + assert!(equal_to_results[3]); + assert!(equal_to_results[4]); + } + fn test_byte_view_equal_to_internal(mut append: A, mut equal_to: E) where A: FnMut(&mut ByteViewGroupValueBuilder, &ArrayRef, &[usize]), From 236b0bcc67853c67e8f25787e8e7297e9624a332 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 20:49:04 +0800 Subject: [PATCH 37/60] fix clippy. --- datafusion/common/src/utils/memory.rs | 6 ++-- .../src/aggregates/group_values/column.rs | 34 ++++++------------- .../aggregates/group_values/group_column.rs | 3 -- 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs index d5ce59e3421b..ec6b59a302d1 100644 --- a/datafusion/common/src/utils/memory.rs +++ b/datafusion/common/src/utils/memory.rs @@ -102,14 +102,14 @@ pub fn estimate_memory_size(num_elements: usize, fixed_size: usize) -> Result #[cfg(test)] mod tests { - use std::collections::HashSet; + use std::{collections::HashSet, mem}; use super::estimate_memory_size; #[test] fn test_estimate_memory() { // size (bytes): 48 - let fixed_size = size_of::>(); + let fixed_size = mem::size_of::>(); // estimated buckets: 16 = (8 * 8 / 7).next_power_of_two() let num_elements = 8; @@ -127,7 +127,7 @@ mod tests { #[test] fn test_estimate_memory_overflow() { let num_elements = usize::MAX; - let fixed_size = size_of::>(); + let fixed_size = mem::size_of::>(); let estimated = estimate_memory_size::(num_elements, fixed_size); assert!(estimated.is_err()); diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs 
b/datafusion/physical-plan/src/aggregates/group_values/column.rs index cd4b18399c3a..c68c42826646 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::ops::Sub; -use std::{iter, mem, usize}; +use std::mem; use crate::aggregates::group_values::group_column::{ ByteGroupValueBuilder, ByteViewGroupValueBuilder, GroupColumn, @@ -24,29 +23,22 @@ use crate::aggregates::group_values::group_column::{ }; use crate::aggregates::group_values::GroupValues; use ahash::RandomState; -use arrow::compute::{self, cast}; +use arrow::compute::cast; use arrow::datatypes::{ BinaryViewType, Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, StringViewType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow::record_batch::RecordBatch; -use arrow_array::{ - Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, - Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, - Int64Array, Int8Array, LargeStringArray, StringArray, StringViewArray, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, -}; -use arrow_schema::{DataType, Schema, SchemaRef, TimeUnit}; +use arrow_array::{Array, ArrayRef}; +use arrow_schema::{DataType, Schema, SchemaRef}; use datafusion_common::hash_utils::create_hashes; use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt}; use datafusion_expr::EmitTo; use datafusion_physical_expr::binary_map::OutputType; -use datafusion_physical_expr_common::datum::compare_with_eq; -use hashbrown::raw::{Bucket, RawTable}; +use hashbrown::raw::RawTable; const NON_INLINED_FLAG: u64 = 0x8000000000000000; const 
VALUE_MASK: u64 = 0x7FFFFFFFFFFFFFFF; @@ -424,7 +416,7 @@ impl VectorizedGroupValuesColumn { for &row in &self.scalarized_indices { let target_hash = batch_hashes[row]; - let entry = map.get_mut(target_hash, |(exist_hash, group_index_view)| { + let entry = map.get_mut(target_hash, |(exist_hash, _)| { // Somewhat surprisingly, this closure can be called even if the // hash doesn't match, so check the hash first with an integer // comparison first avoid the more expensive comparison with @@ -440,7 +432,7 @@ impl VectorizedGroupValuesColumn { }; // Perform scalarized equal to - if self.scalarized_equal_to(&group_index_view, cols, row, groups) { + if self.scalarized_equal_to(group_index_view, cols, row, groups) { // Found the row actually exists in group values, // don't need to create new group for it. continue; @@ -1023,21 +1015,15 @@ fn supported_type(data_type: &DataType) -> bool { #[cfg(test)] mod tests { - use std::{cmp, sync::Arc}; + use std::sync::Arc; - use ahash::RandomState; - use arrow::{ - compute::concat_batches, - util::pretty::{pretty_format_batches, print_batches, print_columns}, - }; + use arrow::{compute::concat_batches, util::pretty::pretty_format_batches}; use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray, StringViewArray}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; - use datafusion_common::hash_utils::create_hashes; use datafusion_expr::EmitTo; use crate::aggregates::group_values::{ - column::{GroupValuesColumn, VectorizedGroupValuesColumn}, - GroupValues, + column::VectorizedGroupValuesColumn, GroupValues, }; #[test] diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index c4a3f64fa477..e0171790c5e8 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -22,11 +22,9 @@ use arrow::array::GenericBinaryArray; use 
arrow::array::GenericStringArray; use arrow::array::OffsetSizeTrait; use arrow::array::PrimitiveArray; -use arrow::array::StringViewBuilder; use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; use arrow::buffer::OffsetBuffer; use arrow::buffer::ScalarBuffer; -use arrow::compute; use arrow::datatypes::ByteArrayType; use arrow::datatypes::ByteViewType; use arrow::datatypes::DataType; @@ -35,7 +33,6 @@ use arrow_array::GenericByteArray; use arrow_array::GenericByteViewArray; use arrow_buffer::Buffer; use datafusion_common::utils::proxy::VecAllocExt; -use datafusion_expr::sqlparser::keywords::NULLABLE; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow_array::types::GenericStringType; From 15aaab1f37df46171eab535a26b81f571d826d96 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 20:53:31 +0800 Subject: [PATCH 38/60] fix fmt. --- .../physical-plan/src/aggregates/group_values/column.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index c68c42826646..ee4e6b9e4219 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use std::mem; +use std::mem::{self, size_of}; use crate::aggregates::group_values::group_column::{ ByteGroupValueBuilder, ByteViewGroupValueBuilder, GroupColumn, @@ -708,7 +708,7 @@ impl GroupValues for VectorizedGroupValuesColumn { self.group_values.clear(); self.map.clear(); self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared - self.map_size = self.map.capacity() * mem::size_of::<(u64, usize)>(); + self.map_size = self.map.capacity() * size_of::<(u64, usize)>(); self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); self.group_index_lists.clear(); @@ -970,7 +970,7 @@ impl GroupValues for GroupValuesColumn { self.group_values.clear(); self.map.clear(); self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared - self.map_size = self.map.capacity() * mem::size_of::<(u64, usize)>(); + self.map_size = self.map.capacity() * size_of::<(u64, usize)>(); self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); } From a0aa7b782c535d6c6ee6e4fe9144173c2242f4fd Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 20:58:55 +0800 Subject: [PATCH 39/60] fix compile in rust 1.79. 
--- datafusion/common/src/utils/memory.rs | 6 +++--- .../core/tests/user_defined/user_defined_aggregates.rs | 1 + datafusion/proto/tests/cases/roundtrip_logical_plan.rs | 1 + datafusion/substrait/tests/cases/roundtrip_logical_plan.rs | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs index ec6b59a302d1..bb68d59eed59 100644 --- a/datafusion/common/src/utils/memory.rs +++ b/datafusion/common/src/utils/memory.rs @@ -102,14 +102,14 @@ pub fn estimate_memory_size(num_elements: usize, fixed_size: usize) -> Result #[cfg(test)] mod tests { - use std::{collections::HashSet, mem}; + use std::{collections::HashSet, mem::size_of}; use super::estimate_memory_size; #[test] fn test_estimate_memory() { // size (bytes): 48 - let fixed_size = mem::size_of::>(); + let fixed_size = size_of::>(); // estimated buckets: 16 = (8 * 8 / 7).next_power_of_two() let num_elements = 8; @@ -127,7 +127,7 @@ mod tests { #[test] fn test_estimate_memory_overflow() { let num_elements = usize::MAX; - let fixed_size = mem::size_of::>(); + let fixed_size = size_of::>(); let estimated = estimate_memory_size::(num_elements, fixed_size); assert!(estimated.is_err()); diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 497addd23094..99c00615376f 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -19,6 +19,7 @@ //! 
user defined aggregate functions use std::hash::{DefaultHasher, Hash, Hasher}; +use std::mem::{size_of, size_of_val}; use std::sync::{ atomic::{AtomicBool, Ordering}, Arc, diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 14d91913e7cd..f07a9e46be8b 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -29,6 +29,7 @@ use prost::Message; use std::any::Any; use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; +use std::mem::size_of_val; use std::sync::Arc; use std::vec; diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 04530dd34d4b..1ce199f51535 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -23,6 +23,7 @@ use datafusion_substrait::logical_plan::{ consumer::from_substrait_plan, producer::to_substrait_plan, }; use std::cmp::Ordering; +use std::mem::size_of_val; use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; use datafusion::common::{not_impl_err, plan_err, DFSchema, DFSchemaRef}; From c2088f704264ffbaf7cfe6ca8b0d754139423624 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 22:26:55 +0800 Subject: [PATCH 40/60] improve comments. 
--- .../src/aggregates/group_values/column.rs | 61 +++++++------------ .../aggregates/group_values/group_column.rs | 13 ++-- 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index ee4e6b9e4219..49e280fc6611 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -43,41 +43,21 @@ use hashbrown::raw::RawTable; const NON_INLINED_FLAG: u64 = 0x8000000000000000; const VALUE_MASK: u64 = 0x7FFFFFFFFFFFFFFF; -/// `BucketContext` is a packed struct +/// The view of indices pointing to the actual values in `GroupValues` /// -/// ### Format: +/// If only single `group index` represented by view, +/// value of view is just the `group index`, and we call it a `inlined view`. /// -/// +---------------------+--------------------+ -/// | checking flag(1bit) | group index(63bit) | -/// +---------------------+--------------------+ -/// -/// ### Checking flag +/// If multiple `group indices` represented by view, +/// value of view is the actually the index pointing to `group indices`, +/// and we call it `non-inlined view`. /// -/// It is possible that rows with same hash values exist in `input cols`. -/// And if we `vectorized_equal_to` and `vectorized append` them -/// in the same round, some fault cases will occur especially when -/// they are totally the repeated rows... +/// The view(a u64) format is like: +/// +---------------------+---------------------------------------------+ +/// | inlined flag(1bit) | group index / index to group indices(63bit) | +/// +---------------------+---------------------------------------------+ /// -/// For example: -/// - Two repeated rows exist in `input cols`. 
-/// -/// - We found their hash values equal to one exist group -/// -/// - We then perform `vectorized_equal_to` for them to the exist group, -/// and found their values not equal to the exist one -/// -/// - Finally when perform `vectorized append`, we decide to build two -/// respective new groups for them, even we actually just need one -/// new group... -/// -/// So for solving such cases simply, if some rows with same hash value -/// in `input cols`, just allow to process one of them in a round, -/// and this flag is used to represent that one of them is processing -/// in current round. -/// -/// ### Group index -/// -/// The group's index in group values +/// `inlined flag`: 1 represents `non-inlined`, and 0 represents `inlined` /// #[derive(Debug, Clone, Copy)] struct GroupIndexView(u64); @@ -114,11 +94,17 @@ pub struct VectorizedGroupValuesColumn { /// Logically maps group values to a group_index in /// [`Self::group_values`] and in each accumulator /// - /// Uses the raw API of hashbrown to avoid actually storing the - /// keys (group values) in the table + /// It is a `hashtable` based on `hashbrown`. + /// + /// Key and value in the `hashtable`: + /// - The `key` is `hash value(u64)` of the `group value` + /// - The `value` is the `group values` with the same `hash value` + /// + /// We don't really store the actual `group values` in `hashtable`, + /// instead we store the `group indices` pointing to values in `GroupValues`. + /// And we use [`GroupIndexView`] to represent such `group indices` in table. 
+ /// /// - /// keys: u64 hashes of the GroupValue - /// values: (hash, group_index) map: RawTable<(u64, GroupIndexView)>, /// The size of `map` in bytes @@ -131,10 +117,9 @@ pub struct VectorizedGroupValuesColumn { /// /// The chained indices is like: /// `latest group index -> older group index -> even older group index -> ...` + /// group_index_lists: Vec>, - index_lists_updates: Vec<(usize, usize)>, - /// Similar as `current_indices`, but `remaining_indices` /// is used to store the rows will be processed in next round. scalarized_indices: Vec, @@ -176,7 +161,6 @@ impl VectorizedGroupValuesColumn { schema, map, group_index_lists: Vec::new(), - index_lists_updates: Vec::new(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), @@ -712,7 +696,6 @@ impl GroupValues for VectorizedGroupValuesColumn { self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); self.group_index_lists.clear(); - self.index_lists_updates.clear(); self.scalarized_indices.clear(); self.vectorized_append_row_indices.clear(); self.vectorized_equal_to_row_indices.clear(); diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index e0171790c5e8..2348dea916fc 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -150,25 +150,28 @@ impl GroupColumn ) { let array = array.as_primitive::(); - for (idx, &lhs_row) in group_indices.iter().enumerate() { + let iter = group_indices + .iter() + .zip(rows.iter()) + .zip(equal_to_results.iter_mut()); + for ((&lhs_row, &rhs_row), equal_to_result) in iter { // Has found not equal to, don't need to check - if !equal_to_results[idx] { + if !*equal_to_result { continue; } - let rhs_row = rows[idx]; // Perf: skip null check (by short circuit) if input is not nullable if NULLABLE { let exist_null = self.nulls.is_null(lhs_row); let 
input_null = array.is_null(rhs_row); if let Some(result) = nulls_equal_to(exist_null, input_null) { - equal_to_results[idx] = result; + *equal_to_result = result; continue; } // Otherwise, we need to check their values } - equal_to_results[idx] = self.group_values[lhs_row] == array.value(rhs_row); + *equal_to_result = self.group_values[lhs_row] == array.value(rhs_row); } } From 7acfef04d516f69272b38970f280a3206c18d465 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 22:44:51 +0800 Subject: [PATCH 41/60] fix doc. --- .../physical-plan/src/aggregates/group_values/column.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 49e280fc6611..39599149ef06 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -959,7 +959,8 @@ impl GroupValues for GroupValuesColumn { } } -/// Returns true if [`GroupValuesColumn`] supported for the specified schema +/// Returns true if [`GroupValuesColumn`] or [`VectorizedGroupValuesColumn`] +/// supported for the specified schema pub fn supported_schema(schema: &Schema) -> bool { schema .fields() @@ -968,10 +969,12 @@ pub fn supported_schema(schema: &Schema) -> bool { .all(supported_type) } -/// Returns true if the specified data type is supported by [`GroupValuesColumn`] +/// Returns true if the specified data type is supported by +/// [`GroupValuesColumn`] or [`VectorizedGroupValuesColumn`] /// /// In order to be supported, there must be a specialized implementation of -/// [`GroupColumn`] for the data type, instantiated in [`Self::intern`] +/// [`GroupColumn`] for the data type, instantiated in [`GroupValuesColumn::intern`] +/// or [`VectorizedGroupValuesColumn::intern`] fn supported_type(data_type: &DataType) -> bool { matches!( *data_type, From 
7875d5077d72cb54d767ff9d5dde5a52b241fba2 Mon Sep 17 00:00:00 2001 From: kamille Date: Wed, 30 Oct 2024 23:34:54 +0800 Subject: [PATCH 42/60] add more comments to explain the really complex vectorized intern process. --- .../src/aggregates/group_values/column.rs | 168 ++++++++++++------ 1 file changed, 110 insertions(+), 58 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 39599149ef06..95cd5271c449 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -282,6 +282,8 @@ impl VectorizedGroupValuesColumn { self.vectorized_equal_to_row_indices.len() ); + self.scalarized_indices.clear(); + if self.vectorized_equal_to_group_indices.is_empty() { return; } @@ -335,57 +337,42 @@ impl VectorizedGroupValuesColumn { self.vectorized_equal_to_results = equal_to_results; } - fn scalarized_equal_to( - &self, - group_index_view: &GroupIndexView, - cols: &[ArrayRef], - row: usize, - groups: &mut Vec, - ) -> bool { - // Check if this row exists in `group_values` - fn check_row_equal( - array_row: &dyn GroupColumn, - lhs_row: usize, - array: &ArrayRef, - rhs_row: usize, - ) -> bool { - array_row.equal_to(lhs_row, array, rhs_row) - } - - if group_index_view.is_non_inlined() { - let list_offset = group_index_view.value() as usize; - let group_index_list = &self.group_index_lists[list_offset]; - - for &group_idx in group_index_list { - let mut check_result = true; - for (i, group_val) in self.group_values.iter().enumerate() { - if !check_row_equal(group_val.as_ref(), group_idx, &cols[i], row) { - check_result = false; - break; - } - } - - if check_result { - groups[row] = group_idx; - return true; - } - } - - // All groups unmatched, return false result - false - } else { - let group_idx = group_index_view.value() as usize; - for (i, group_val) in self.group_values.iter().enumerate() { - 
if !check_row_equal(group_val.as_ref(), group_idx, &cols[i], row) { - return false; - } - } - - groups[row] = group_idx; - true - } - } - + /// It is possible that some `input rows` have the same + /// hash values with the `exist rows`, but have the different + /// actual values the exists. + /// + /// We can found them in `vectorized_equal_to`, and put them + /// into `scalarized_indices`. And for these `input rows`, + /// we will perform the `scalarized_intern` similar as what in + /// [`GroupValuesColumn`]. + /// + /// This design can make the process simple and still efficient enough: + /// + /// # About making the process simple + /// + /// Some corner cases become really easy to solve, like following cases: + /// + /// ```text + /// input row1 (same hash value with exist rows, but value different) + /// input row1 + /// ... + /// input row1 + /// ``` + /// + /// After performing `vectorized_equal_to`, we will found multiple `input rows` + /// not equal to the `exist rows`. However such `input rows` are repeated, only + /// one new group should be create for them. + /// + /// If we don't fallback to `scalarized_intern`, it is really hard for us to + /// distinguish the such `repeated rows` in `input rows`. And if we just fallback, + /// it is really easy to solve, and the performance is at least not worse than origin. + /// + /// # About performance + /// + /// The hash collision may be not frequent, so the fallback will indeed hardly happen. + /// In most situations, `scalarized_indices` will found to be empty after finishing to + /// preform `vectorized_equal_to`. 
+ /// fn scalarized_intern( &mut self, cols: &[ArrayRef], @@ -464,6 +451,57 @@ impl VectorizedGroupValuesColumn { self.map = map; } + + fn scalarized_equal_to( + &self, + group_index_view: &GroupIndexView, + cols: &[ArrayRef], + row: usize, + groups: &mut Vec, + ) -> bool { + // Check if this row exists in `group_values` + fn check_row_equal( + array_row: &dyn GroupColumn, + lhs_row: usize, + array: &ArrayRef, + rhs_row: usize, + ) -> bool { + array_row.equal_to(lhs_row, array, rhs_row) + } + + if group_index_view.is_non_inlined() { + let list_offset = group_index_view.value() as usize; + let group_index_list = &self.group_index_lists[list_offset]; + + for &group_idx in group_index_list { + let mut check_result = true; + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal(group_val.as_ref(), group_idx, &cols[i], row) { + check_result = false; + break; + } + } + + if check_result { + groups[row] = group_idx; + return true; + } + } + + // All groups unmatched, return false result + false + } else { + let group_idx = group_index_view.value() as usize; + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal(group_val.as_ref(), group_idx, &cols[i], row) { + return false; + } + } + + groups[row] = group_idx; + true + } + } } /// instantiates a [`PrimitiveGroupValueBuilder`] and pushes it into $v @@ -545,6 +583,7 @@ impl GroupValues for VectorizedGroupValuesColumn { // tracks to which group each of the input rows belongs groups.clear(); + groups.resize(n_rows, usize::MAX); let mut batch_hashes = mem::take(&mut self.hashes_buffer); batch_hashes.clear(); @@ -552,12 +591,25 @@ impl GroupValues for VectorizedGroupValuesColumn { create_hashes(cols, &self.random_state, &mut batch_hashes)?; // General steps for one round `vectorized equal_to & append`: - // 1. Collect vectorized context by checking hash values of `cols` in `map` - // 2. Perform `vectorized_equal_to` - // 3. Perform `vectorized_append` - // 4. 
Update `current_indices` - groups.resize(n_rows, usize::MAX); - self.scalarized_indices.clear(); + // 1. Collect vectorized context by checking hash values of `cols` in `map`, + // mainly fill `vectorized_append_row_indices`, `vectorized_equal_to_row_indices` + // and `vectorized_equal_to_group_indices` + // + // 2. Perform `vectorized_append` for `vectorized_append_row_indices`. + // `vectorized_append` must be performed before `vectorized_equal_to`, + // because some `group indices` in `vectorized_equal_to_group_indices` + // may be actually placeholders, and still point to no actual values in + // `group_values` before performing append. + // + // 3. Perform `vectorized_equal_to` for `vectorized_equal_to_row_indices` + // and `vectorized_equal_to_group_indices`. If found some rows in input `cols` + // not equal to `exist rows` in `group_values`, place them in `scalarized_indices` + // and perform `scalarized_intern` for them similar as what in [`GroupValuesColumn`] + // after. + // + // 4. Perform `scalarized_intern` for rows mentioned above, when we process like this + // can see the comments of `scalarized_intern`. + // // 1. Collect vectorized context by checking hash values of `cols` in `map` self.collect_vectorized_process_context(&batch_hashes, groups); @@ -568,7 +620,7 @@ impl GroupValues for VectorizedGroupValuesColumn { // 3. Perform `vectorized_equal_to` self.vectorized_equal_to(cols, groups); - // 4. Update `current_indices` + // 4. Perform `scalarized_intern` self.scalarized_intern(cols, &batch_hashes, groups); self.hashes_buffer = batch_hashes; From 41f5f045d070014975f44873fa642c43a7dc605f Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 00:04:32 +0800 Subject: [PATCH 43/60] add comments to explain why we still need origin `GroupValuesColumn`. 
--- .../src/aggregates/group_values/column.rs | 48 +++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 95cd5271c449..ebd06e185572 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -85,7 +85,8 @@ impl GroupIndexView { } } -/// A [`GroupValues`] that stores multiple columns of group values. +/// A [`GroupValues`] that stores multiple columns of group values, +/// and supports vectorized operators for them /// pub struct VectorizedGroupValuesColumn { /// The output schema @@ -574,7 +575,9 @@ impl GroupValues for VectorizedGroupValuesColumn { v.push(Box::new(b) as _) } dt => { - return not_impl_err!("{dt} not supported in GroupValuesColumn") + return not_impl_err!( + "{dt} not supported in VectorizedGroupValuesColumn" + ) } } } @@ -756,8 +759,47 @@ impl GroupValues for VectorizedGroupValuesColumn { } } -/// A [`GroupValues`] that stores multiple columns of group values. +/// A [`GroupValues`] that stores multiple columns of group values, +/// and supports scalarized operators for them +/// +/// This scalarized implementation is used only for `streaming aggregation`, +/// because it depends on the order between `input rows` and their corresponding +/// `group indices`. 
+/// +/// For example, assuming a `input rows` with 4 new rows +/// (not equal to `exist rows` in `group_values`, and need to create +/// new groups for them): +/// +/// ```text +/// row1 (hash collision with the exist rows) +/// row2 +/// row3 (hash collision with the exist rows) +/// row4 +/// ``` +/// +/// # In [`GroupValuesColumn`], their `group indices` will be +/// +/// ```text +/// row1 --> 0 +/// row2 --> 1 +/// row3 --> 2 +/// row4 --> 3 +/// ``` +/// +/// `Group indices` order agrees with their input order, and the `streaming aggregation` +/// depends on this. +/// +/// # However In [`VectorizedGroupValuesColumn`], their `group indices` will be +/// +/// ```text +/// row1 --> 2 +/// row2 --> 0 +/// row3 --> 3 +/// row4 --> 1 +/// ``` /// +/// `Group indices` order are against with their input order, and this will lead to error +/// in `streaming aggregation`. /// pub struct GroupValuesColumn { /// The output schema From 7efce589347c44e062cdb7d0ad3a320b5a97af8f Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 00:13:22 +0800 Subject: [PATCH 44/60] remove some stale comments. --- .../src/aggregates/group_values/column.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index ebd06e185572..56c6ed183001 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -199,13 +199,9 @@ impl VectorizedGroupValuesColumn { let mut group_values_len = self.group_values[0].len(); for (row, &target_hash) in batch_hashes.iter().enumerate() { - let entry = self.map.get(target_hash, |(exist_hash, _)| { - // Somewhat surprisingly, this closure can be called even if the - // hash doesn't match, so check the hash first with an integer - // comparison first avoid the more expensive comparison with - // group value. 
https://github.com/apache/datafusion/pull/11718 - target_hash == *exist_hash - }); + let entry = self + .map + .get(target_hash, |(exist_hash, _)| target_hash == *exist_hash); let Some((_, group_index_view)) = entry else { // 1. Bucket not found case From 5cbe3fad5d6d5a13713670b7db87a96732babe52 Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 00:41:24 +0800 Subject: [PATCH 45/60] fix clippy. --- .../physical-plan/src/aggregates/group_values/column.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 56c6ed183001..8acf62b8d973 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -191,7 +191,7 @@ impl VectorizedGroupValuesColumn { fn collect_vectorized_process_context( &mut self, batch_hashes: &[u64], - groups: &mut Vec, + groups: &mut [usize], ) { self.vectorized_append_row_indices.clear(); self.vectorized_equal_to_row_indices.clear(); @@ -374,7 +374,7 @@ impl VectorizedGroupValuesColumn { &mut self, cols: &[ArrayRef], batch_hashes: &[u64], - groups: &mut Vec, + groups: &mut [usize], ) { if self.scalarized_indices.is_empty() { return; @@ -454,7 +454,7 @@ impl VectorizedGroupValuesColumn { group_index_view: &GroupIndexView, cols: &[ArrayRef], row: usize, - groups: &mut Vec, + groups: &mut [usize], ) -> bool { // Check if this row exists in `group_values` fn check_row_equal( From 8b23ff3fd0e24fad216b9519611618959e8680d9 Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 00:41:40 +0800 Subject: [PATCH 46/60] add comments for `vectorized_equal_to` and `vectorized_append`. 
--- .../aggregates/group_values/group_column.rs | 47 ++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 2348dea916fc..ec3d1cbe9c24 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -58,25 +58,40 @@ pub trait GroupColumn: Send + Sync { /// /// Note that this comparison returns true if both elements are NULL fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool; + /// Appends the row at `row` in `array` to this builder fn append_val(&mut self, array: &ArrayRef, row: usize); + /// The vectorized version equal to + /// + /// When found nth row stored in this builder at `lhs_row` + /// is equal to the row in `array` at `rhs_row`, + /// it will record the `true` result at the corresponding + /// position in `equal_to_results`. + /// + /// And if found nth result in `equal_to_results` is already + /// `false`, the check for nth row will be skipped. 
+ /// fn vectorized_equal_to( &self, - group_indices: &[usize], + lhs_rows: &[usize], array: &ArrayRef, - rows: &[usize], + rhs_rows: &[usize], equal_to_results: &mut [bool], ); + /// The vectorized version `append_val` fn vectorized_append(&mut self, array: &ArrayRef, rows: &[usize]); /// Returns the number of rows stored in this builder fn len(&self) -> usize; + /// Returns the number of bytes used by this [`GroupColumn`] fn size(&self) -> usize; + /// Builds a new array from all of the stored rows fn build(self: Box) -> ArrayRef; + /// Builds a new array from the first `n` stored rows, shifting the /// remaining rows to the start of the builder fn take_n(&mut self, n: usize) -> ArrayRef; @@ -143,16 +158,16 @@ impl GroupColumn fn vectorized_equal_to( &self, - group_indices: &[usize], + lhs_rows: &[usize], array: &ArrayRef, - rows: &[usize], + rhs_rows: &[usize], equal_to_results: &mut [bool], ) { let array = array.as_primitive::(); - let iter = group_indices + let iter = lhs_rows .iter() - .zip(rows.iter()) + .zip(rhs_rows.iter()) .zip(equal_to_results.iter_mut()); for ((&lhs_row, &rhs_row), equal_to_result) in iter { // Has found not equal to, don't need to check @@ -320,22 +335,22 @@ where fn vectorized_equal_to_inner( &self, - group_indices: &[usize], + lhs_rows: &[usize], array: &ArrayRef, - rows: &[usize], + rhs_rows: &[usize], equal_to_results: &mut [bool], ) where B: ByteArrayType, { let array = array.as_bytes::(); - for (idx, &lhs_row) in group_indices.iter().enumerate() { + for (idx, &lhs_row) in lhs_rows.iter().enumerate() { // Has found not equal to, don't need to check if !equal_to_results[idx] { continue; } - let rhs_row = rows[idx]; + let rhs_row = rhs_rows[idx]; equal_to_results[idx] = self.do_equal_to_inner(lhs_row, array, rhs_row); } } @@ -471,9 +486,9 @@ where fn vectorized_equal_to( &self, - group_indices: &[usize], + lhs_rows: &[usize], array: &ArrayRef, - rows: &[usize], + rhs_rows: &[usize], equal_to_results: &mut [bool], ) { // Sanity array 
type @@ -484,9 +499,9 @@ where DataType::Binary | DataType::LargeBinary )); self.vectorized_equal_to_inner::>( - group_indices, + lhs_rows, array, - rows, + rhs_rows, equal_to_results, ); } @@ -496,9 +511,9 @@ where DataType::Utf8 | DataType::LargeUtf8 )); self.vectorized_equal_to_inner::>( - group_indices, + lhs_rows, array, - rows, + rhs_rows, equal_to_results, ); } From df81f8fa716694a74b87d6ee18f9fbaedc320b7e Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 01:57:19 +0800 Subject: [PATCH 47/60] fix clippy. --- .../physical-plan/src/aggregates/group_values/column.rs | 7 ++++--- .../src/aggregates/group_values/group_column.rs | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 8acf62b8d973..8d1a3c2791c3 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -273,7 +273,7 @@ impl VectorizedGroupValuesColumn { /// and perform `scalarized_intern` for them after. /// Usually, such `rows` having same hash but different value with `exists rows` /// are very few. 
- fn vectorized_equal_to(&mut self, cols: &[ArrayRef], groups: &mut Vec) { + fn vectorized_equal_to(&mut self, cols: &[ArrayRef], groups: &mut [usize]) { assert_eq!( self.vectorized_equal_to_group_indices.len(), self.vectorized_equal_to_row_indices.len() @@ -1142,7 +1142,8 @@ mod tests { }; let sub_batch = group_values.emit(EmitTo::First(num_emit)).unwrap(); - let sub_batch = RecordBatch::try_new(schema.clone(), sub_batch).unwrap(); + let sub_batch = + RecordBatch::try_new(Arc::clone(&schema), sub_batch).unwrap(); actual_sub_batches.push(sub_batch); num_remaining_rows -= num_emit; @@ -1463,7 +1464,7 @@ mod tests { fn load_to_group_values(&self, group_values: &mut impl GroupValues) { for batch in self.test_batches.iter() { - group_values.intern(&batch, &mut vec![]).unwrap(); + group_values.intern(batch, &mut vec![]).unwrap(); } } diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index ec3d1cbe9c24..31ca3ebbb89e 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -1197,7 +1197,7 @@ mod tests { equal_to_results: &mut Vec| { let iter = lhs_rows.iter().zip(rhs_rows.iter()); for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { - equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row); } }; @@ -1310,7 +1310,7 @@ mod tests { equal_to_results: &mut Vec| { let iter = lhs_rows.iter().zip(rhs_rows.iter()); for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { - equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row); } }; @@ -1500,7 +1500,7 @@ mod tests { equal_to_results: &mut Vec| { let iter = lhs_rows.iter().zip(rhs_rows.iter()); for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { - 
equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row); } }; @@ -1703,7 +1703,7 @@ mod tests { equal_to_results: &mut Vec| { let iter = lhs_rows.iter().zip(rhs_rows.iter()); for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() { - equal_to_results[idx] = builder.equal_to(lhs_row, &input_array, rhs_row); + equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row); } }; From 81f99a82d0127b7d9eecbd448f3b2a00a1820108 Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 02:11:53 +0800 Subject: [PATCH 48/60] use zip to simplify codes. --- .../aggregates/group_values/group_column.rs | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 31ca3ebbb89e..75992eb365dd 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -169,6 +169,7 @@ impl GroupColumn .iter() .zip(rhs_rows.iter()) .zip(equal_to_results.iter_mut()); + for ((&lhs_row, &rhs_row), equal_to_result) in iter { // Has found not equal to, don't need to check if !*equal_to_result { @@ -344,14 +345,18 @@ where { let array = array.as_bytes::(); - for (idx, &lhs_row) in lhs_rows.iter().enumerate() { + let iter = lhs_rows + .iter() + .zip(rhs_rows.iter()) + .zip(equal_to_results.iter_mut()); + + for ((&lhs_row, &rhs_row), equal_to_result) in iter { // Has found not equal to, don't need to check - if !equal_to_results[idx] { + if !*equal_to_result { continue; } - let rhs_row = rhs_rows[idx]; - equal_to_results[idx] = self.do_equal_to_inner(lhs_row, array, rhs_row); + *equal_to_result = self.do_equal_to_inner(lhs_row, array, rhs_row); } } @@ -722,21 +727,25 @@ impl ByteViewGroupValueBuilder { fn vectorized_equal_to_inner( &self, - 
group_indices: &[usize], + lhs_rows: &[usize], array: &ArrayRef, - rows: &[usize], + rhs_rows: &[usize], equal_to_results: &mut [bool], ) { let array = array.as_byte_view::(); - for (idx, &lhs_row) in group_indices.iter().enumerate() { + let iter = lhs_rows + .iter() + .zip(rhs_rows.iter()) + .zip(equal_to_results.iter_mut()); + + for ((&lhs_row, &rhs_row), equal_to_result) in iter { // Has found not equal to, don't need to check - if !equal_to_results[idx] { + if !*equal_to_result { continue; } - let rhs_row = rows[idx]; - equal_to_results[idx] = self.do_equal_to_inner(lhs_row, array, rhs_row); + *equal_to_result = self.do_equal_to_inner(lhs_row, array, rhs_row); } } From b7a2443219bfc87fa4d71a6d1069d26e023160a4 Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 31 Oct 2024 13:37:32 +0800 Subject: [PATCH 49/60] use izip to simplify codes. --- .../aggregates/group_values/group_column.rs | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 75992eb365dd..8fd204d2e77d 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -33,6 +33,7 @@ use arrow_array::GenericByteArray; use arrow_array::GenericByteViewArray; use arrow_buffer::Buffer; use datafusion_common::utils::proxy::VecAllocExt; +use itertools::izip; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow_array::types::GenericStringType; @@ -165,12 +166,13 @@ impl GroupColumn ) { let array = array.as_primitive::(); - let iter = lhs_rows - .iter() - .zip(rhs_rows.iter()) - .zip(equal_to_results.iter_mut()); + let iter = izip!( + lhs_rows.iter(), + rhs_rows.iter(), + equal_to_results.iter_mut(), + ); - for ((&lhs_row, &rhs_row), equal_to_result) in iter { + for (&lhs_row, &rhs_row, equal_to_result) in iter 
{ // Has found not equal to, don't need to check if !*equal_to_result { continue; @@ -345,12 +347,13 @@ where { let array = array.as_bytes::(); - let iter = lhs_rows - .iter() - .zip(rhs_rows.iter()) - .zip(equal_to_results.iter_mut()); + let iter = izip!( + lhs_rows.iter(), + rhs_rows.iter(), + equal_to_results.iter_mut(), + ); - for ((&lhs_row, &rhs_row), equal_to_result) in iter { + for (&lhs_row, &rhs_row, equal_to_result) in iter { // Has found not equal to, don't need to check if !*equal_to_result { continue; @@ -734,12 +737,13 @@ impl ByteViewGroupValueBuilder { ) { let array = array.as_byte_view::(); - let iter = lhs_rows - .iter() - .zip(rhs_rows.iter()) - .zip(equal_to_results.iter_mut()); + let iter = izip!( + lhs_rows.iter(), + rhs_rows.iter(), + equal_to_results.iter_mut(), + ); - for ((&lhs_row, &rhs_row), equal_to_result) in iter { + for (&lhs_row, &rhs_row, equal_to_result) in iter { // Has found not equal to, don't need to check if !*equal_to_result { continue; From 4b45708639d26b0e8e5a157e8cd0576cc8a2fb3b Mon Sep 17 00:00:00 2001 From: kamille <3144148605@qq.com> Date: Thu, 31 Oct 2024 13:53:38 +0800 Subject: [PATCH 50/60] Update datafusion/physical-plan/src/aggregates/group_values/group_column.rs Co-authored-by: Jay Zhan --- .../physical-plan/src/aggregates/group_values/group_column.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 8fd204d2e77d..1f59c617d883 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -173,7 +173,7 @@ impl GroupColumn ); for (&lhs_row, &rhs_row, equal_to_result) in iter { - // Has found not equal to, don't need to check + // Has found not equal to in previous column, don't need to check if !*equal_to_result { continue; } From 
d1b879add94c06ac51645a274345b4229a91a246 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 31 Oct 2024 19:58:07 +0800 Subject: [PATCH 51/60] first_n attempt Signed-off-by: jayzhan211 --- .../src/aggregates/group_values/column.rs | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 8d1a3c2791c3..539e79bfc389 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -661,10 +661,7 @@ impl GroupValues for VectorizedGroupValuesColumn { .iter_mut() .map(|v| v.take_n(n)) .collect::>(); - let new_group_index_lists = - Vec::with_capacity(self.group_index_lists.len()); - let old_group_index_lists = - mem::replace(&mut self.group_index_lists, new_group_index_lists); + let mut index = 0; // SAFETY: self.map outlives iterator and is not modified concurrently unsafe { @@ -673,12 +670,12 @@ impl GroupValues for VectorizedGroupValuesColumn { if bucket.as_ref().1.is_non_inlined() { // Non-inlined case // We take `group_index_list` from `old_group_index_lists` - let list_offset = bucket.as_ref().1.value() as usize; - let old_group_index_list = - &old_group_index_lists[list_offset]; + // list_offset is incrementally + let list_offset = bucket.as_ref().1.value() as usize; + println!("list_offset: {:?}", list_offset); let mut new_group_index_list = Vec::new(); - for &group_index in old_group_index_list { + for group_index in self.group_index_lists[list_offset].iter() { if let Some(remaining) = group_index.checked_sub(n) { new_group_index_list.push(remaining); } @@ -695,11 +692,11 @@ impl GroupValues for VectorizedGroupValuesColumn { bucket.as_mut().1 = GroupIndexView::new_inlined(*group_index as u64); } else { - let new_list_offset = self.group_index_lists.len(); - self.group_index_lists.push(new_group_index_list); + 
self.group_index_lists[index] = new_group_index_list; bucket.as_mut().1 = GroupIndexView::new_non_inlined( - new_list_offset as u64, + index as u64, ); + index += 1; } } else { // Inlined case, we just decrement group index by n @@ -717,6 +714,8 @@ impl GroupValues for VectorizedGroupValuesColumn { } } + self.group_index_lists.truncate(index); + output } }; From 14841db719ae3401c713d2decba8abd23de887a1 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 1 Nov 2024 14:38:39 +0800 Subject: [PATCH 52/60] add test Signed-off-by: jayzhan211 --- .../src/aggregates/group_values/column.rs | 79 +++++++++++++++++-- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 539e79bfc389..2ba819d168e0 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -673,9 +673,9 @@ impl GroupValues for VectorizedGroupValuesColumn { // list_offset is incrementally let list_offset = bucket.as_ref().1.value() as usize; - println!("list_offset: {:?}", list_offset); let mut new_group_index_list = Vec::new(); - for group_index in self.group_index_lists[list_offset].iter() { + for group_index in self.group_index_lists[list_offset].iter() + { if let Some(remaining) = group_index.checked_sub(n) { new_group_index_list.push(remaining); } @@ -693,9 +693,8 @@ impl GroupValues for VectorizedGroupValuesColumn { GroupIndexView::new_inlined(*group_index as u64); } else { self.group_index_lists[index] = new_group_index_list; - bucket.as_mut().1 = GroupIndexView::new_non_inlined( - index as u64, - ); + bucket.as_mut().1 = + GroupIndexView::new_non_inlined(index as u64); index += 1; } } else { @@ -1090,17 +1089,20 @@ fn supported_type(data_type: &DataType) -> bool { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{collections::HashMap, sync::Arc}; use 
arrow::{compute::concat_batches, util::pretty::pretty_format_batches}; use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringArray, StringViewArray}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; + use datafusion_common::utils::proxy::RawTableAllocExt; use datafusion_expr::EmitTo; use crate::aggregates::group_values::{ column::VectorizedGroupValuesColumn, GroupValues, }; + use super::GroupIndexView; + #[test] fn test_intern_for_vectorized_group_values() { let data_set = VectorizedTestDataSet::new(); @@ -1114,6 +1116,71 @@ mod tests { check_result(&actual_batch, &data_set.expected_batch); } + #[test] + fn test_emit_first_n_non_inlined_1() { + let field = Field::new("item", DataType::Int32, true); + let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); + let mut group_values = VectorizedGroupValuesColumn::try_new(schema).unwrap(); + + insert_inline(&mut group_values, 0, 0); + insert_non_inline(&mut group_values, 1, 0, vec![1, 2]); + insert_inline(&mut group_values, 2, 3); + let _ = group_values.emit(EmitTo::First(4)).unwrap(); + // All the index < 4, all erased + assert_eq!(group_values.map.len(), 0); + } + + #[test] + fn test_emit_first_n_non_inlined_2() { + let field = Field::new("item", DataType::Int32, true); + let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); + let mut group_values = VectorizedGroupValuesColumn::try_new(schema).unwrap(); + insert_inline(&mut group_values, 0, 0); // erased + insert_non_inline(&mut group_values, 1, 0, vec![1, 5]); // remain 1 (5 - 4) + insert_inline(&mut group_values, 2, 7); // remain 3 (7 - 4) + insert_non_inline(&mut group_values, 3, 1, vec![2, 8, 9]); // remain 2 (8 - 4) and (9 - 4) + let _ = group_values.emit(EmitTo::First(4)).unwrap(); + assert_eq!(group_values.map.len(), 3); + let group_index = group_values.map.get(1, |_| true).unwrap().1; + assert!(!group_index.is_non_inlined()); + assert_eq!(group_index.value(), 1); + let group_index = 
group_values.map.get(2, |_| true).unwrap().1; + assert!(!group_index.is_non_inlined()); + assert_eq!(group_index.value(), 3); + let group_index = group_values.map.get(3, |_| true).unwrap().1; + assert!(group_index.is_non_inlined()); + assert_eq!(group_index.value(), 0); // offset is 0 + assert_eq!(group_values.group_index_lists[0], vec![4, 5]); + } + + fn insert_inline( + group_values: &mut VectorizedGroupValuesColumn, + hash_key: u64, + group_index: u64, + ) { + let group_index_view = GroupIndexView::new_inlined(group_index); + group_values.map.insert_accounted( + (hash_key, group_index_view), + |(hash, _)| *hash, + &mut group_values.map_size, + ); + } + + fn insert_non_inline( + group_values: &mut VectorizedGroupValuesColumn, + hash_key: u64, + list_offset: u64, + group_indexes: Vec, + ) { + let group_index_view = GroupIndexView::new_non_inlined(list_offset); + group_values.group_index_lists.push(group_indexes); + group_values.map.insert_accounted( + (hash_key, group_index_view), + |(hash, _)| *hash, + &mut group_values.map_size, + ); + } + #[test] fn test_emit_first_n_for_vectorized_group_values() { let data_set = VectorizedTestDataSet::new(); From 8cd581d8081b72c3b25174f409246105aacf2a45 Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 2 Nov 2024 01:50:59 +0800 Subject: [PATCH 53/60] improve hashtable modifying in emit first n test. 
--- .../src/aggregates/group_values/column.rs | 231 +++++++++++++----- 1 file changed, 165 insertions(+), 66 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 2ba819d168e0..ee198951217e 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -59,7 +59,7 @@ const VALUE_MASK: u64 = 0x7FFFFFFFFFFFFFFF; /// /// `inlined flag`: 1 represents `non-inlined`, and 0 represents `inlined` /// -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] struct GroupIndexView(u64); impl GroupIndexView { @@ -499,6 +499,28 @@ impl VectorizedGroupValuesColumn { true } } + + /// Return group indices of the hash, also if its `group_index_view` is non-inlined + #[cfg(test)] + fn get_indices_by_hash(&self, hash: u64) -> Option<(Vec, GroupIndexView)> { + let entry = self.map.get(hash, |(exist_hash, _)| hash == *exist_hash); + + match entry { + Some((_, group_index_view)) => { + if group_index_view.is_non_inlined() { + let list_offset = group_index_view.value() as usize; + Some(( + self.group_index_lists[list_offset].clone(), + *group_index_view, + )) + } else { + let group_index = group_index_view.value() as usize; + Some((vec![group_index], *group_index_view)) + } + } + None => None, + } + } } /// instantiates a [`PrimitiveGroupValueBuilder`] and pushes it into $v @@ -1116,71 +1138,6 @@ mod tests { check_result(&actual_batch, &data_set.expected_batch); } - #[test] - fn test_emit_first_n_non_inlined_1() { - let field = Field::new("item", DataType::Int32, true); - let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); - let mut group_values = VectorizedGroupValuesColumn::try_new(schema).unwrap(); - - insert_inline(&mut group_values, 0, 0); - insert_non_inline(&mut group_values, 1, 0, vec![1, 2]); - insert_inline(&mut group_values, 2, 3); - let _ = 
group_values.emit(EmitTo::First(4)).unwrap(); - // All the index < 4, all erased - assert_eq!(group_values.map.len(), 0); - } - - #[test] - fn test_emit_first_n_non_inlined_2() { - let field = Field::new("item", DataType::Int32, true); - let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); - let mut group_values = VectorizedGroupValuesColumn::try_new(schema).unwrap(); - insert_inline(&mut group_values, 0, 0); // erased - insert_non_inline(&mut group_values, 1, 0, vec![1, 5]); // remain 1 (5 - 4) - insert_inline(&mut group_values, 2, 7); // remain 3 (7 - 4) - insert_non_inline(&mut group_values, 3, 1, vec![2, 8, 9]); // remain 2 (8 - 4) and (9 - 4) - let _ = group_values.emit(EmitTo::First(4)).unwrap(); - assert_eq!(group_values.map.len(), 3); - let group_index = group_values.map.get(1, |_| true).unwrap().1; - assert!(!group_index.is_non_inlined()); - assert_eq!(group_index.value(), 1); - let group_index = group_values.map.get(2, |_| true).unwrap().1; - assert!(!group_index.is_non_inlined()); - assert_eq!(group_index.value(), 3); - let group_index = group_values.map.get(3, |_| true).unwrap().1; - assert!(group_index.is_non_inlined()); - assert_eq!(group_index.value(), 0); // offset is 0 - assert_eq!(group_values.group_index_lists[0], vec![4, 5]); - } - - fn insert_inline( - group_values: &mut VectorizedGroupValuesColumn, - hash_key: u64, - group_index: u64, - ) { - let group_index_view = GroupIndexView::new_inlined(group_index); - group_values.map.insert_accounted( - (hash_key, group_index_view), - |(hash, _)| *hash, - &mut group_values.map_size, - ); - } - - fn insert_non_inline( - group_values: &mut VectorizedGroupValuesColumn, - hash_key: u64, - list_offset: u64, - group_indexes: Vec, - ) { - let group_index_view = GroupIndexView::new_non_inlined(list_offset); - group_values.group_index_lists.push(group_indexes); - group_values.map.insert_accounted( - (hash_key, group_index_view), - |(hash, _)| *hash, - &mut group_values.map_size, - ); - 
} - #[test] fn test_emit_first_n_for_vectorized_group_values() { let data_set = VectorizedTestDataSet::new(); @@ -1221,6 +1178,120 @@ mod tests { } } + #[test] + fn test_hashtable_modifying_in_emit_first_n() { + // Situations should be covered: + // 1. Erase inlined group index view + // 2. Erase whole non-inlined group index view + // 3. Erase + decrease group indices in non-inlined group index view + // + view still non-inlined after decreasing + // 4. Erase + decrease group indices in non-inlined group index view + // + view switch to inlined after decreasing + // 5. Only decrease group index in inlined group index view + // 6. Only decrease group indices in non-inlined group index view + // 7. Erase all things + + let field = Field::new("item", DataType::Int32, true); + let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); + let mut group_values = VectorizedGroupValuesColumn::try_new(schema).unwrap(); + + // Insert group index views and check if success to insert + insert_inline_group_index_view(&mut group_values, 0, 0); + insert_non_inline_group_index_view(&mut group_values, 1, vec![1, 2]); + insert_non_inline_group_index_view(&mut group_values, 2, vec![3, 4, 5]); + insert_inline_group_index_view(&mut group_values, 3, 6); + insert_non_inline_group_index_view(&mut group_values, 4, vec![7, 8]); + insert_non_inline_group_index_view(&mut group_values, 5, vec![9, 10, 11]); + + assert_eq!( + group_values.get_indices_by_hash(0).unwrap(), + (vec![0], GroupIndexView::new_inlined(0)) + ); + assert_eq!( + group_values.get_indices_by_hash(1).unwrap(), + (vec![1, 2], GroupIndexView::new_non_inlined(0)) + ); + assert_eq!( + group_values.get_indices_by_hash(2).unwrap(), + (vec![3, 4, 5], GroupIndexView::new_non_inlined(1)) + ); + assert_eq!( + group_values.get_indices_by_hash(3).unwrap(), + (vec![6], GroupIndexView::new_inlined(6)) + ); + assert_eq!( + group_values.get_indices_by_hash(4).unwrap(), + (vec![7, 8], GroupIndexView::new_non_inlined(2)) + 
); + assert_eq!( + group_values.get_indices_by_hash(5).unwrap(), + (vec![9, 10, 11], GroupIndexView::new_non_inlined(3)) + ); + assert_eq!(group_values.map.len(), 6); + + // Emit first 4 to test cases 1~3, 5~6 + let _ = group_values.emit(EmitTo::First(4)).unwrap(); + assert!(group_values.get_indices_by_hash(0).is_none()); + assert!(group_values.get_indices_by_hash(1).is_none()); + assert_eq!( + group_values.get_indices_by_hash(2).unwrap(), + (vec![0, 1], GroupIndexView::new_non_inlined(0)) + ); + assert_eq!( + group_values.get_indices_by_hash(3).unwrap(), + (vec![2], GroupIndexView::new_inlined(2)) + ); + assert_eq!( + group_values.get_indices_by_hash(4).unwrap(), + (vec![3, 4], GroupIndexView::new_non_inlined(1)) + ); + assert_eq!( + group_values.get_indices_by_hash(5).unwrap(), + (vec![5, 6, 7], GroupIndexView::new_non_inlined(2)) + ); + assert_eq!(group_values.map.len(), 4); + + // Emit first 1 to test case 4, and cases 5~6 again + let _ = group_values.emit(EmitTo::First(1)).unwrap(); + assert_eq!( + group_values.get_indices_by_hash(2).unwrap(), + (vec![0], GroupIndexView::new_inlined(0)) + ); + assert_eq!( + group_values.get_indices_by_hash(3).unwrap(), + (vec![1], GroupIndexView::new_inlined(1)) + ); + assert_eq!( + group_values.get_indices_by_hash(4).unwrap(), + (vec![2, 3], GroupIndexView::new_non_inlined(0)) + ); + assert_eq!( + group_values.get_indices_by_hash(5).unwrap(), + (vec![4, 5, 6], GroupIndexView::new_non_inlined(1)) + ); + assert_eq!(group_values.map.len(), 4); + + // Emit first 5 to test cases 1~3 again + let _ = group_values.emit(EmitTo::First(5)).unwrap(); + assert_eq!( + group_values.get_indices_by_hash(5).unwrap(), + (vec![0, 1], GroupIndexView::new_non_inlined(0)) + ); + assert_eq!(group_values.map.len(), 1); + + // Emit first 1 to test cases 4 again + let _ = group_values.emit(EmitTo::First(1)).unwrap(); + assert_eq!( + group_values.get_indices_by_hash(5).unwrap(), + (vec![0], GroupIndexView::new_inlined(0)) + ); + 
assert_eq!(group_values.map.len(), 1);
+
+        // Emit first 1 to test cases 7
+        let _ = group_values.emit(EmitTo::First(1)).unwrap();
+        assert!(group_values.map.is_empty());
+    }
+
     /// Test data set for [`VectorizedGroupValuesColumn`]
     ///
     /// Define the test data and support loading them into test [`VectorizedGroupValuesColumn`]
@@ -1571,4 +1642,32 @@ mod tests {
             );
         }
     }
+
+    fn insert_inline_group_index_view(
+        group_values: &mut VectorizedGroupValuesColumn,
+        hash_key: u64,
+        group_index: u64,
+    ) {
+        let group_index_view = GroupIndexView::new_inlined(group_index);
+        group_values.map.insert_accounted(
+            (hash_key, group_index_view),
+            |(hash, _)| *hash,
+            &mut group_values.map_size,
+        );
+    }
+
+    fn insert_non_inline_group_index_view(
+        group_values: &mut VectorizedGroupValuesColumn,
+        hash_key: u64,
+        group_indices: Vec,
+    ) {
+        let list_offset = group_values.group_index_lists.len();
+        let group_index_view = GroupIndexView::new_non_inlined(list_offset as u64);
+        group_values.group_index_lists.push(group_indices);
+        group_values.map.insert_accounted(
+            (hash_key, group_index_view),
+            |(hash, _)| *hash,
+            &mut group_values.map_size,
+        );
+    }
 }

From 75aa1dcd94c1155afc69484db2e25c1920c5bcdb Mon Sep 17 00:00:00 2001
From: kamille
Date: Sat, 2 Nov 2024 02:28:47 +0800
Subject: [PATCH 54/60] add `emit_group_index_list_buffer` to avoid allocating
 new `Vec` to store the remaining group indices.
--- .../src/aggregates/group_values/column.rs | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index ee198951217e..a93f71c77191 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -121,6 +121,13 @@ pub struct VectorizedGroupValuesColumn { /// group_index_lists: Vec>, + /// When emitting first n, we need to decrease/erase group indices in + /// `map` and `group_index_lists`. + /// + /// This buffer is used to temporarily store the remaining group indices in + /// a specific list in `group_index_lists`. + emit_group_index_list_buffer: Vec, + /// Similar as `current_indices`, but `remaining_indices` /// is used to store the rows will be processed in next round. scalarized_indices: Vec, @@ -162,6 +169,7 @@ impl VectorizedGroupValuesColumn { schema, map, group_index_lists: Vec::new(), + emit_group_index_list_buffer: Vec::new(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), @@ -683,7 +691,7 @@ impl GroupValues for VectorizedGroupValuesColumn { .iter_mut() .map(|v| v.take_n(n)) .collect::>(); - let mut index = 0; + let mut next_new_list_offset = 0; // SAFETY: self.map outlives iterator and is not modified concurrently unsafe { @@ -694,12 +702,12 @@ impl GroupValues for VectorizedGroupValuesColumn { // We take `group_index_list` from `old_group_index_lists` // list_offset is incrementally + self.emit_group_index_list_buffer.clear(); let list_offset = bucket.as_ref().1.value() as usize; - let mut new_group_index_list = Vec::new(); for group_index in self.group_index_lists[list_offset].iter() { if let Some(remaining) = group_index.checked_sub(n) { - new_group_index_list.push(remaining); + self.emit_group_index_list_buffer.push(remaining); } } @@ -707,17 +715,23 @@ impl GroupValues for 
VectorizedGroupValuesColumn { // - `new_group_index_list` is empty, we should erase this bucket // - only one value in `new_group_index_list`, switch the `view` to `inlined` // - still multiple values in `new_group_index_list`, build and set the new `unlined view` - if new_group_index_list.is_empty() { + if self.emit_group_index_list_buffer.is_empty() { self.map.erase(bucket); - } else if new_group_index_list.len() == 1 { - let group_index = new_group_index_list.first().unwrap(); + } else if self.emit_group_index_list_buffer.len() == 1 { + let group_index = + self.emit_group_index_list_buffer.first().unwrap(); bucket.as_mut().1 = GroupIndexView::new_inlined(*group_index as u64); } else { - self.group_index_lists[index] = new_group_index_list; - bucket.as_mut().1 = - GroupIndexView::new_non_inlined(index as u64); - index += 1; + let group_index_list = + &mut self.group_index_lists[next_new_list_offset]; + group_index_list.clear(); + group_index_list + .extend(self.emit_group_index_list_buffer.iter()); + bucket.as_mut().1 = GroupIndexView::new_non_inlined( + next_new_list_offset as u64, + ); + next_new_list_offset += 1; } } else { // Inlined case, we just decrement group index by n @@ -735,7 +749,7 @@ impl GroupValues for VectorizedGroupValuesColumn { } } - self.group_index_lists.truncate(index); + self.group_index_lists.truncate(next_new_list_offset); output } @@ -767,6 +781,7 @@ impl GroupValues for VectorizedGroupValuesColumn { self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); self.group_index_lists.clear(); + self.emit_group_index_list_buffer.clear(); self.scalarized_indices.clear(); self.vectorized_append_row_indices.clear(); self.vectorized_equal_to_row_indices.clear(); From 406acb4983efe0c2072c5d7759674eec9db9404a Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 2 Nov 2024 02:31:16 +0800 Subject: [PATCH 55/60] make comments in VectorizedGroupValuesColumn::intern simpler and clearer. 
--- datafusion/physical-plan/src/aggregates/group_values/column.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index a93f71c77191..3625fc454d37 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -627,8 +627,7 @@ impl GroupValues for VectorizedGroupValuesColumn { // 2. Perform `vectorized_append` for `vectorized_append_row_indices`. // `vectorized_append` must be performed before `vectorized_equal_to`, // because some `group indices` in `vectorized_equal_to_group_indices` - // may be actually placeholders, and still point to no actual values in - // `group_values` before performing append. + // maybe still point to no actual values in `group_values` before performing append. // // 3. Perform `vectorized_equal_to` for `vectorized_equal_to_row_indices` // and `vectorized_equal_to_group_indices`. If found some rows in input `cols` From 7a1ed90462fdfbcc85aad5bafe2e0dd98c2d645f Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 3 Nov 2024 01:55:57 +0800 Subject: [PATCH 56/60] define `VectorizedOperationBuffers` to hold buffers used in vectorized operations to make code clearer. --- .../src/aggregates/group_values/column.rs | 159 ++++++++++++------ 1 file changed, 109 insertions(+), 50 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 3625fc454d37..fe37329ddf9a 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -128,21 +128,8 @@ pub struct VectorizedGroupValuesColumn { /// a specific list in `group_index_lists`. 
emit_group_index_list_buffer: Vec, - /// Similar as `current_indices`, but `remaining_indices` - /// is used to store the rows will be processed in next round. - scalarized_indices: Vec, - - /// The `vectorized_equal_tod` row indices buffer - vectorized_equal_to_row_indices: Vec, - - /// The `vectorized_equal_tod` group indices buffer - vectorized_equal_to_group_indices: Vec, - - /// The `vectorized_equal_tod` result buffer - vectorized_equal_to_results: Vec, - - /// The `vectorized append` row indices buffer - vectorized_append_row_indices: Vec, + /// Buffers for `vectorized_append` and `vectorized_equal_to` + vectorized_operation_buffers: VectorizedOperationBuffers, /// The actual group by values, stored column-wise. Compare from /// the left to right, each column is stored as [`GroupColumn`]. @@ -161,6 +148,38 @@ pub struct VectorizedGroupValuesColumn { random_state: RandomState, } +/// Buffers to store intermediate results in `vectorized_append` +/// and `vectorized_equal_to`, for reducing memory allocation +#[derive(Default)] +struct VectorizedOperationBuffers { + /// The `vectorized append` row indices buffer + append_row_indices: Vec, + + /// The `vectorized_equal_to` row indices buffer + equal_to_row_indices: Vec, + + /// The `vectorized_equal_to` group indices buffer + equal_to_group_indices: Vec, + + /// The `vectorized_equal_to` result buffer + equal_to_results: Vec, + + /// The buffer for storing row indices found not equal to + /// exist groups in `group_values` in `vectorized_equal_to`. + /// We will perform `scalarized_intern` for such rows. 
+ remaining_row_indices: Vec, +} + +impl VectorizedOperationBuffers { + fn clear(&mut self) { + self.append_row_indices.clear(); + self.equal_to_row_indices.clear(); + self.equal_to_group_indices.clear(); + self.equal_to_results.clear(); + self.remaining_row_indices.clear(); + } +} + impl VectorizedGroupValuesColumn { /// Create a new instance of GroupValuesColumn if supported for the specified schema pub fn try_new(schema: SchemaRef) -> Result { @@ -170,15 +189,11 @@ impl VectorizedGroupValuesColumn { map, group_index_lists: Vec::new(), emit_group_index_list_buffer: Vec::new(), + vectorized_operation_buffers: VectorizedOperationBuffers::default(), map_size: 0, group_values: vec![], hashes_buffer: Default::default(), random_state: Default::default(), - scalarized_indices: Default::default(), - vectorized_equal_to_row_indices: Default::default(), - vectorized_equal_to_group_indices: Default::default(), - vectorized_equal_to_results: Default::default(), - vectorized_append_row_indices: Default::default(), }) } @@ -201,9 +216,13 @@ impl VectorizedGroupValuesColumn { batch_hashes: &[u64], groups: &mut [usize], ) { - self.vectorized_append_row_indices.clear(); - self.vectorized_equal_to_row_indices.clear(); - self.vectorized_equal_to_group_indices.clear(); + self.vectorized_operation_buffers.append_row_indices.clear(); + self.vectorized_operation_buffers + .equal_to_row_indices + .clear(); + self.vectorized_operation_buffers + .equal_to_group_indices + .clear(); let mut group_values_len = self.group_values[0].len(); for (row, &target_hash) in batch_hashes.iter().enumerate() { @@ -227,7 +246,9 @@ impl VectorizedGroupValuesColumn { ); // Add row index to `vectorized_append_row_indices` - self.vectorized_append_row_indices.push(row); + self.vectorized_operation_buffers + .append_row_indices + .push(row); // Set group index to row in `groups` groups[row] = current_group_idx; @@ -245,26 +266,41 @@ impl VectorizedGroupValuesColumn { let list_offset = group_index_view.value() 
as usize; let group_index_list = &self.group_index_lists[list_offset]; for &group_index in group_index_list { - self.vectorized_equal_to_row_indices.push(row); - self.vectorized_equal_to_group_indices.push(group_index); + self.vectorized_operation_buffers + .equal_to_row_indices + .push(row); + self.vectorized_operation_buffers + .equal_to_group_indices + .push(group_index); } } else { let group_index = group_index_view.value() as usize; - self.vectorized_equal_to_row_indices.push(row); - self.vectorized_equal_to_group_indices.push(group_index); + self.vectorized_operation_buffers + .equal_to_row_indices + .push(row); + self.vectorized_operation_buffers + .equal_to_group_indices + .push(group_index); } } } /// Perform `vectorized_append`` for `rows` in `vectorized_append_row_indices` fn vectorized_append(&mut self, cols: &[ArrayRef]) { - if self.vectorized_append_row_indices.is_empty() { + if self + .vectorized_operation_buffers + .append_row_indices + .is_empty() + { return; } let iter = self.group_values.iter_mut().zip(cols.iter()); for (group_column, col) in iter { - group_column.vectorized_append(col, &self.vectorized_append_row_indices); + group_column.vectorized_append( + col, + &self.vectorized_operation_buffers.append_row_indices, + ); } } @@ -283,27 +319,41 @@ impl VectorizedGroupValuesColumn { /// are very few. fn vectorized_equal_to(&mut self, cols: &[ArrayRef], groups: &mut [usize]) { assert_eq!( - self.vectorized_equal_to_group_indices.len(), - self.vectorized_equal_to_row_indices.len() + self.vectorized_operation_buffers + .equal_to_group_indices + .len(), + self.vectorized_operation_buffers.equal_to_row_indices.len() ); - self.scalarized_indices.clear(); + self.vectorized_operation_buffers + .remaining_row_indices + .clear(); - if self.vectorized_equal_to_group_indices.is_empty() { + if self + .vectorized_operation_buffers + .equal_to_group_indices + .is_empty() + { return; } // 1. 
Perform `vectorized_equal_to` for `rows` in `vectorized_equal_to_group_indices` // and `group_indices` in `vectorized_equal_to_group_indices` - let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results); + let mut equal_to_results = + mem::take(&mut self.vectorized_operation_buffers.equal_to_results); equal_to_results.clear(); - equal_to_results.resize(self.vectorized_equal_to_group_indices.len(), true); + equal_to_results.resize( + self.vectorized_operation_buffers + .equal_to_group_indices + .len(), + true, + ); for (col_idx, group_col) in self.group_values.iter().enumerate() { group_col.vectorized_equal_to( - &self.vectorized_equal_to_group_indices, + &self.vectorized_operation_buffers.equal_to_group_indices, &cols[col_idx], - &self.vectorized_equal_to_row_indices, + &self.vectorized_operation_buffers.equal_to_row_indices, &mut equal_to_results, ); } @@ -311,19 +361,26 @@ impl VectorizedGroupValuesColumn { // 2. Check `equal_to_results`, if found not equal to `row`s, just add them // to `scalarized_indices`, and perform `scalarized_intern` for them after. 
let mut current_row_equal_to_result = false; - for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() { + for (idx, &row) in self + .vectorized_operation_buffers + .equal_to_row_indices + .iter() + .enumerate() + { let equal_to_result = equal_to_results[idx]; // Equal to case, set the `group_indices` to `rows` in `groups` if equal_to_result { - groups[row] = self.vectorized_equal_to_group_indices[idx]; + groups[row] = + self.vectorized_operation_buffers.equal_to_group_indices[idx]; } current_row_equal_to_result |= equal_to_result; // Look forward next one row to check if have checked all results // of current row let next_row = self - .vectorized_equal_to_row_indices + .vectorized_operation_buffers + .equal_to_row_indices .get(idx + 1) .unwrap_or(&usize::MAX); @@ -331,7 +388,9 @@ impl VectorizedGroupValuesColumn { if row != *next_row { // Not equal to case, add `row` to `scalarized_indices` if !current_row_equal_to_result { - self.scalarized_indices.push(row); + self.vectorized_operation_buffers + .remaining_row_indices + .push(row); } // Init the total result for checking next row @@ -339,7 +398,7 @@ impl VectorizedGroupValuesColumn { } } - self.vectorized_equal_to_results = equal_to_results; + self.vectorized_operation_buffers.equal_to_results = equal_to_results; } /// It is possible that some `input rows` have the same @@ -384,13 +443,17 @@ impl VectorizedGroupValuesColumn { batch_hashes: &[u64], groups: &mut [usize], ) { - if self.scalarized_indices.is_empty() { + if self + .vectorized_operation_buffers + .remaining_row_indices + .is_empty() + { return; } let mut map = mem::take(&mut self.map); - for &row in &self.scalarized_indices { + for &row in &self.vectorized_operation_buffers.remaining_row_indices { let target_hash = batch_hashes[row]; let entry = map.get_mut(target_hash, |(exist_hash, _)| { // Somewhat surprisingly, this closure can be called even if the @@ -781,11 +844,7 @@ impl GroupValues for VectorizedGroupValuesColumn { 
self.hashes_buffer.shrink_to(count); self.group_index_lists.clear(); self.emit_group_index_list_buffer.clear(); - self.scalarized_indices.clear(); - self.vectorized_append_row_indices.clear(); - self.vectorized_equal_to_row_indices.clear(); - self.vectorized_equal_to_group_indices.clear(); - self.vectorized_equal_to_results.clear(); + self.vectorized_operation_buffers.clear(); } } From 2d982a1b5a739d562f8a1d30bd4bdb481b9e22fb Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 4 Nov 2024 23:10:11 +0800 Subject: [PATCH 57/60] unify `VectorizedGroupValuesColumn` and `GroupValuesColumn`. --- .../src/aggregates/group_values/column.rs | 682 ++++++++---------- .../src/aggregates/group_values/mod.rs | 12 +- 2 files changed, 292 insertions(+), 402 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index fe37329ddf9a..bf5ab6213d13 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -88,7 +88,7 @@ impl GroupIndexView { /// A [`GroupValues`] that stores multiple columns of group values, /// and supports vectorized operators for them /// -pub struct VectorizedGroupValuesColumn { +pub struct GroupValuesColumn { /// The output schema schema: SchemaRef, @@ -180,7 +180,11 @@ impl VectorizedOperationBuffers { } } -impl VectorizedGroupValuesColumn { +impl GroupValuesColumn { + // ======================================================================== + // Initialization functions + // ======================================================================== + /// Create a new instance of GroupValuesColumn if supported for the specified schema pub fn try_new(schema: SchemaRef) -> Result { let map = RawTable::with_capacity(0); @@ -197,6 +201,207 @@ impl VectorizedGroupValuesColumn { }) } + // ======================================================================== + // Scalarized intern + // 
======================================================================== + + /// Scalarized intern + /// + /// This is used only for `streaming aggregation`, + /// because it depends on the order between `input rows` and their corresponding + /// `group indices`. + /// + /// For example, assuming `input rows` in `cols` with 4 new rows + /// (not equal to `exist rows` in `group_values`, and need to create + /// new groups for them): + /// + /// ```text + /// row1 (hash collision with the exist rows) + /// row2 + /// row3 (hash collision with the exist rows) + /// row4 + /// ``` + /// + /// # In [`GroupValuesColumn`], their `group indices` will be + /// + /// ```text + /// row1 --> 0 + /// row2 --> 1 + /// row3 --> 2 + /// row4 --> 3 + /// ``` + /// + /// `Group indices` order agrees with their input order, and the `streaming aggregation` + /// depends on this. + /// + /// # However In [`VectorizedGroupValuesColumn`], their `group indices` will be + /// + /// ```text + /// row1 --> 2 + /// row2 --> 0 + /// row3 --> 3 + /// row4 --> 1 + /// ``` + /// + /// `Group indices` order are against with their input order, and this will lead to error + /// in `streaming aggregation`. 
+ /// + fn scalarized_intern( + &mut self, + cols: &[ArrayRef], + groups: &mut Vec, + ) -> Result<()> { + let n_rows = cols[0].len(); + + // tracks to which group each of the input rows belongs + groups.clear(); + + // 1.1 Calculate the group keys for the group values + let batch_hashes = &mut self.hashes_buffer; + batch_hashes.clear(); + batch_hashes.resize(n_rows, 0); + create_hashes(cols, &self.random_state, batch_hashes)?; + + for (row, &target_hash) in batch_hashes.iter().enumerate() { + let entry = self + .map + .get_mut(target_hash, |(exist_hash, group_idx_view)| { + // It is ensured to be inlined in `scalarized_intern` + debug_assert!(!group_idx_view.is_non_inlined()); + + // Somewhat surprisingly, this closure can be called even if the + // hash doesn't match, so check the hash first with an integer + // comparison first avoid the more expensive comparison with + // group value. https://github.com/apache/datafusion/pull/11718 + if target_hash != *exist_hash { + return false; + } + + fn check_row_equal( + array_row: &dyn GroupColumn, + lhs_row: usize, + array: &ArrayRef, + rhs_row: usize, + ) -> bool { + array_row.equal_to(lhs_row, array, rhs_row) + } + + for (i, group_val) in self.group_values.iter().enumerate() { + if !check_row_equal( + group_val.as_ref(), + group_idx_view.value() as usize, + &cols[i], + row, + ) { + return false; + } + } + + true + }); + + let group_idx = match entry { + // Existing group_index for this group value + Some((_hash, group_idx_view)) => group_idx_view.value() as usize, + // 1.2 Need to create new entry for the group + None => { + // Add new entry to aggr_state and save newly created index + // let group_idx = group_values.num_rows(); + // group_values.push(group_rows.row(row)); + + let mut checklen = 0; + let group_idx = self.group_values[0].len(); + for (i, group_value) in self.group_values.iter_mut().enumerate() { + group_value.append_val(&cols[i], row); + let len = group_value.len(); + if i == 0 { + checklen = len; + } 
else { + debug_assert_eq!(checklen, len); + } + } + + // for hasher function, use precomputed hash value + self.map.insert_accounted( + (target_hash, GroupIndexView::new_inlined(group_idx as u64)), + |(hash, _group_index)| *hash, + &mut self.map_size, + ); + group_idx + } + }; + groups.push(group_idx); + } + + Ok(()) + } + + // ======================================================================== + // Vectorized intern + // ======================================================================== + + /// Vectorized intern + /// + /// This is used in `non-streaming aggregation` without requiring the order between + /// rows in `cols` and corresponding groups in `group_values`. + /// + /// The vectorized approach can offer higher performance for avoiding row by row + /// downcast for `cols` and being able to implement even more optimizations(like simd). + /// + fn vectorized_intern( + &mut self, + cols: &[ArrayRef], + groups: &mut Vec, + ) -> Result<()> { + let n_rows = cols[0].len(); + + // tracks to which group each of the input rows belongs + groups.clear(); + groups.resize(n_rows, usize::MAX); + + let mut batch_hashes = mem::take(&mut self.hashes_buffer); + batch_hashes.clear(); + batch_hashes.resize(n_rows, 0); + create_hashes(cols, &self.random_state, &mut batch_hashes)?; + + // General steps for one round `vectorized equal_to & append`: + // 1. Collect vectorized context by checking hash values of `cols` in `map`, + // mainly fill `vectorized_append_row_indices`, `vectorized_equal_to_row_indices` + // and `vectorized_equal_to_group_indices` + // + // 2. Perform `vectorized_append` for `vectorized_append_row_indices`. + // `vectorized_append` must be performed before `vectorized_equal_to`, + // because some `group indices` in `vectorized_equal_to_group_indices` + // maybe still point to no actual values in `group_values` before performing append. + // + // 3. 
Perform `vectorized_equal_to` for `vectorized_equal_to_row_indices` + // and `vectorized_equal_to_group_indices`. If found some rows in input `cols` + // not equal to `exist rows` in `group_values`, place them in `scalarized_indices` + // and perform `scalarized_intern` for them similar as what in [`GroupValuesColumn`] + // after. + // + // 4. Perform `scalarized_intern` for rows mentioned above, when we process like this + // can see the comments of `scalarized_intern`. + // + + // 1. Collect vectorized context by checking hash values of `cols` in `map` + self.collect_vectorized_process_context(&batch_hashes, groups); + + // 2. Perform `vectorized_append` + self.vectorized_append(cols); + + // 3. Perform `vectorized_equal_to` + self.vectorized_equal_to(cols, groups); + + // 4. Perform scalarized inter for remaining rows + // (about remaining rows, can see comments for `remaining_rows`) + self.scalarized_intern_remaining(cols, &batch_hashes, groups); + + self.hashes_buffer = batch_hashes; + + Ok(()) + } + /// Collect vectorized context by checking hash values of `cols` in `map` /// /// 1. If bucket not found @@ -437,7 +642,7 @@ impl VectorizedGroupValuesColumn { /// In most situations, `scalarized_indices` will found to be empty after finishing to /// preform `vectorized_equal_to`. /// - fn scalarized_intern( + fn scalarized_intern_remaining( &mut self, cols: &[ArrayRef], batch_hashes: &[u64], @@ -471,7 +676,7 @@ impl VectorizedGroupValuesColumn { }; // Perform scalarized equal to - if self.scalarized_equal_to(group_index_view, cols, row, groups) { + if self.scalarized_equal_to_remaining(group_index_view, cols, row, groups) { // Found the row actually exists in group values, // don't need to create new group for it. continue; @@ -520,7 +725,7 @@ impl VectorizedGroupValuesColumn { self.map = map; } - fn scalarized_equal_to( + fn scalarized_equal_to_remaining( &self, group_index_view: &GroupIndexView, cols: &[ArrayRef], @@ -613,10 +818,8 @@ macro_rules! 
instantiate_primitive { }; } -impl GroupValues for VectorizedGroupValuesColumn { +impl GroupValues for GroupValuesColumn { fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { - let n_rows = cols[0].len(); - if self.group_values.is_empty() { let mut v = Vec::with_capacity(cols.len()); @@ -673,50 +876,11 @@ impl GroupValues for VectorizedGroupValuesColumn { self.group_values = v; } - // tracks to which group each of the input rows belongs - groups.clear(); - groups.resize(n_rows, usize::MAX); - - let mut batch_hashes = mem::take(&mut self.hashes_buffer); - batch_hashes.clear(); - batch_hashes.resize(n_rows, 0); - create_hashes(cols, &self.random_state, &mut batch_hashes)?; - - // General steps for one round `vectorized equal_to & append`: - // 1. Collect vectorized context by checking hash values of `cols` in `map`, - // mainly fill `vectorized_append_row_indices`, `vectorized_equal_to_row_indices` - // and `vectorized_equal_to_group_indices` - // - // 2. Perform `vectorized_append` for `vectorized_append_row_indices`. - // `vectorized_append` must be performed before `vectorized_equal_to`, - // because some `group indices` in `vectorized_equal_to_group_indices` - // maybe still point to no actual values in `group_values` before performing append. - // - // 3. Perform `vectorized_equal_to` for `vectorized_equal_to_row_indices` - // and `vectorized_equal_to_group_indices`. If found some rows in input `cols` - // not equal to `exist rows` in `group_values`, place them in `scalarized_indices` - // and perform `scalarized_intern` for them similar as what in [`GroupValuesColumn`] - // after. - // - // 4. Perform `scalarized_intern` for rows mentioned above, when we process like this - // can see the comments of `scalarized_intern`. - // - - // 1. Collect vectorized context by checking hash values of `cols` in `map` - self.collect_vectorized_process_context(&batch_hashes, groups); - - // 2. 
Perform `vectorized_append` - self.vectorized_append(cols); - - // 3. Perform `vectorized_equal_to` - self.vectorized_equal_to(cols, groups); - - // 4. Perform `scalarized_intern` - self.scalarized_intern(cols, &batch_hashes, groups); - - self.hashes_buffer = batch_hashes; - - Ok(()) + if !STREAMING { + self.vectorized_intern(cols, groups) + } else { + self.scalarized_intern(cols, groups) + } } fn size(&self) -> usize { @@ -758,358 +922,74 @@ impl GroupValues for VectorizedGroupValuesColumn { // SAFETY: self.map outlives iterator and is not modified concurrently unsafe { for bucket in self.map.iter() { - // Check if it is `inlined` or `non-inlined` - if bucket.as_ref().1.is_non_inlined() { - // Non-inlined case - // We take `group_index_list` from `old_group_index_lists` - - // list_offset is incrementally - self.emit_group_index_list_buffer.clear(); - let list_offset = bucket.as_ref().1.value() as usize; - for group_index in self.group_index_lists[list_offset].iter() - { - if let Some(remaining) = group_index.checked_sub(n) { - self.emit_group_index_list_buffer.push(remaining); + // In non-streaming case, we need to check if the `group index view` + // is `inlined` or `non-inlined` + if !STREAMING { + if bucket.as_ref().1.is_non_inlined() { + // Non-inlined case + // We take `group_index_list` from `old_group_index_lists` + + // list_offset is incrementally + self.emit_group_index_list_buffer.clear(); + let list_offset = bucket.as_ref().1.value() as usize; + for group_index in + self.group_index_lists[list_offset].iter() + { + if let Some(remaining) = group_index.checked_sub(n) { + self.emit_group_index_list_buffer.push(remaining); + } } - } - // The possible results: - // - `new_group_index_list` is empty, we should erase this bucket - // - only one value in `new_group_index_list`, switch the `view` to `inlined` - // - still multiple values in `new_group_index_list`, build and set the new `unlined view` - if self.emit_group_index_list_buffer.is_empty() { - 
self.map.erase(bucket); - } else if self.emit_group_index_list_buffer.len() == 1 { - let group_index = - self.emit_group_index_list_buffer.first().unwrap(); - bucket.as_mut().1 = - GroupIndexView::new_inlined(*group_index as u64); - } else { - let group_index_list = - &mut self.group_index_lists[next_new_list_offset]; - group_index_list.clear(); - group_index_list - .extend(self.emit_group_index_list_buffer.iter()); - bucket.as_mut().1 = GroupIndexView::new_non_inlined( - next_new_list_offset as u64, - ); - next_new_list_offset += 1; - } - } else { - // Inlined case, we just decrement group index by n - let group_index = bucket.as_ref().1.value() as usize; - match group_index.checked_sub(n) { - // Group index was >= n, shift value down - Some(sub) => { + // The possible results: + // - `new_group_index_list` is empty, we should erase this bucket + // - only one value in `new_group_index_list`, switch the `view` to `inlined` + // - still multiple values in `new_group_index_list`, build and set the new `unlined view` + if self.emit_group_index_list_buffer.is_empty() { + self.map.erase(bucket); + } else if self.emit_group_index_list_buffer.len() == 1 { + let group_index = self + .emit_group_index_list_buffer + .first() + .unwrap(); bucket.as_mut().1 = - GroupIndexView::new_inlined(sub as u64) + GroupIndexView::new_inlined(*group_index as u64); + } else { + let group_index_list = + &mut self.group_index_lists[next_new_list_offset]; + group_index_list.clear(); + group_index_list + .extend(self.emit_group_index_list_buffer.iter()); + bucket.as_mut().1 = GroupIndexView::new_non_inlined( + next_new_list_offset as u64, + ); + next_new_list_offset += 1; } - // Group index was < n, so remove from table - None => self.map.erase(bucket), - } - } - } - } - - self.group_index_lists.truncate(next_new_list_offset); - - output - } - }; - - // TODO: Materialize dictionaries in group keys (#7647) - for (field, array) in self.schema.fields.iter().zip(&mut output) { - let expected = 
field.data_type(); - if let DataType::Dictionary(_, v) = expected { - let actual = array.data_type(); - if v.as_ref() != actual { - return Err(DataFusionError::Internal(format!( - "Converted group rows expected dictionary of {v} got {actual}" - ))); - } - *array = cast(array.as_ref(), expected)?; - } - } - - Ok(output) - } - - fn clear_shrink(&mut self, batch: &RecordBatch) { - let count = batch.num_rows(); - self.group_values.clear(); - self.map.clear(); - self.map.shrink_to(count, |_| 0); // hasher does not matter since the map is cleared - self.map_size = self.map.capacity() * size_of::<(u64, usize)>(); - self.hashes_buffer.clear(); - self.hashes_buffer.shrink_to(count); - self.group_index_lists.clear(); - self.emit_group_index_list_buffer.clear(); - self.vectorized_operation_buffers.clear(); - } -} - -/// A [`GroupValues`] that stores multiple columns of group values, -/// and supports scalarized operators for them -/// -/// This scalarized implementation is used only for `streaming aggregation`, -/// because it depends on the order between `input rows` and their corresponding -/// `group indices`. -/// -/// For example, assuming a `input rows` with 4 new rows -/// (not equal to `exist rows` in `group_values`, and need to create -/// new groups for them): -/// -/// ```text -/// row1 (hash collision with the exist rows) -/// row2 -/// row3 (hash collision with the exist rows) -/// row4 -/// ``` -/// -/// # In [`GroupValuesColumn`], their `group indices` will be -/// -/// ```text -/// row1 --> 0 -/// row2 --> 1 -/// row3 --> 2 -/// row4 --> 3 -/// ``` -/// -/// `Group indices` order agrees with their input order, and the `streaming aggregation` -/// depends on this. 
-/// -/// # However In [`VectorizedGroupValuesColumn`], their `group indices` will be -/// -/// ```text -/// row1 --> 2 -/// row2 --> 0 -/// row3 --> 3 -/// row4 --> 1 -/// ``` -/// -/// `Group indices` order are against with their input order, and this will lead to error -/// in `streaming aggregation`. -/// -pub struct GroupValuesColumn { - /// The output schema - schema: SchemaRef, - - /// Logically maps group values to a group_index in - /// [`Self::group_values`] and in each accumulator - /// - /// Uses the raw API of hashbrown to avoid actually storing the - /// keys (group values) in the table - /// - /// keys: u64 hashes of the GroupValue - /// values: (hash, group_index) - map: RawTable<(u64, usize)>, - - /// The size of `map` in bytes - map_size: usize, - - /// The actual group by values, stored column-wise. Compare from - /// the left to right, each column is stored as [`GroupColumn`]. - /// - /// Performance tests showed that this design is faster than using the - /// more general purpose [`GroupValuesRows`]. 
See the ticket for details: - /// - /// - /// [`GroupValuesRows`]: crate::aggregates::group_values::row::GroupValuesRows - group_values: Vec>, - - /// reused buffer to store hashes - hashes_buffer: Vec, - - /// Random state for creating hashes - random_state: RandomState, -} - -impl GroupValuesColumn { - /// Create a new instance of GroupValuesColumn if supported for the specified schema - pub fn try_new(schema: SchemaRef) -> Result { - let map = RawTable::with_capacity(0); - Ok(Self { - schema, - map, - map_size: 0, - group_values: vec![], - hashes_buffer: Default::default(), - random_state: Default::default(), - }) - } -} -impl GroupValues for GroupValuesColumn { - fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec) -> Result<()> { - let n_rows = cols[0].len(); - - if self.group_values.is_empty() { - let mut v = Vec::with_capacity(cols.len()); - - for f in self.schema.fields().iter() { - let nullable = f.is_nullable(); - match f.data_type() { - &DataType::Int8 => instantiate_primitive!(v, nullable, Int8Type), - &DataType::Int16 => instantiate_primitive!(v, nullable, Int16Type), - &DataType::Int32 => instantiate_primitive!(v, nullable, Int32Type), - &DataType::Int64 => instantiate_primitive!(v, nullable, Int64Type), - &DataType::UInt8 => instantiate_primitive!(v, nullable, UInt8Type), - &DataType::UInt16 => instantiate_primitive!(v, nullable, UInt16Type), - &DataType::UInt32 => instantiate_primitive!(v, nullable, UInt32Type), - &DataType::UInt64 => instantiate_primitive!(v, nullable, UInt64Type), - &DataType::Float32 => { - instantiate_primitive!(v, nullable, Float32Type) - } - &DataType::Float64 => { - instantiate_primitive!(v, nullable, Float64Type) - } - &DataType::Date32 => instantiate_primitive!(v, nullable, Date32Type), - &DataType::Date64 => instantiate_primitive!(v, nullable, Date64Type), - &DataType::Utf8 => { - let b = ByteGroupValueBuilder::::new(OutputType::Utf8); - v.push(Box::new(b) as _) - } - &DataType::LargeUtf8 => { - let b = 
ByteGroupValueBuilder::::new(OutputType::Utf8); - v.push(Box::new(b) as _) - } - &DataType::Binary => { - let b = ByteGroupValueBuilder::::new(OutputType::Binary); - v.push(Box::new(b) as _) - } - &DataType::LargeBinary => { - let b = ByteGroupValueBuilder::::new(OutputType::Binary); - v.push(Box::new(b) as _) - } - dt => { - return not_impl_err!("{dt} not supported in GroupValuesColumn") - } - } - } - self.group_values = v; - } - - // tracks to which group each of the input rows belongs - groups.clear(); - - // 1.1 Calculate the group keys for the group values - let batch_hashes = &mut self.hashes_buffer; - batch_hashes.clear(); - batch_hashes.resize(n_rows, 0); - create_hashes(cols, &self.random_state, batch_hashes)?; - - for (row, &target_hash) in batch_hashes.iter().enumerate() { - let entry = self.map.get_mut(target_hash, |(exist_hash, group_idx)| { - // Somewhat surprisingly, this closure can be called even if the - // hash doesn't match, so check the hash first with an integer - // comparison first avoid the more expensive comparison with - // group value. 
https://github.com/apache/datafusion/pull/11718 - if target_hash != *exist_hash { - return false; - } - - fn check_row_equal( - array_row: &dyn GroupColumn, - lhs_row: usize, - array: &ArrayRef, - rhs_row: usize, - ) -> bool { - array_row.equal_to(lhs_row, array, rhs_row) - } - - for (i, group_val) in self.group_values.iter().enumerate() { - if !check_row_equal(group_val.as_ref(), *group_idx, &cols[i], row) { - return false; - } - } - - true - }); - - let group_idx = match entry { - // Existing group_index for this group value - Some((_hash, group_idx)) => *group_idx, - // 1.2 Need to create new entry for the group - None => { - // Add new entry to aggr_state and save newly created index - // let group_idx = group_values.num_rows(); - // group_values.push(group_rows.row(row)); - - let mut checklen = 0; - let group_idx = self.group_values[0].len(); - for (i, group_value) in self.group_values.iter_mut().enumerate() { - group_value.append_val(&cols[i], row); - let len = group_value.len(); - if i == 0 { - checklen = len; - } else { - debug_assert_eq!(checklen, len); + continue; + } } - } - // for hasher function, use precomputed hash value - self.map.insert_accounted( - (target_hash, group_idx), - |(hash, _group_index)| *hash, - &mut self.map_size, - ); - group_idx - } - }; - groups.push(group_idx); - } - - Ok(()) - } - - fn size(&self) -> usize { - let group_values_size: usize = self.group_values.iter().map(|v| v.size()).sum(); - group_values_size + self.map_size + self.hashes_buffer.allocated_size() - } - - fn is_empty(&self) -> bool { - self.len() == 0 - } - - fn len(&self) -> usize { - if self.group_values.is_empty() { - return 0; - } - - self.group_values[0].len() - } - - fn emit(&mut self, emit_to: EmitTo) -> Result> { - let mut output = match emit_to { - EmitTo::All => { - let group_values = mem::take(&mut self.group_values); - debug_assert!(self.group_values.is_empty()); - - group_values - .into_iter() - .map(|v| v.build()) - .collect::>() - } - 
EmitTo::First(n) => { - let output = self - .group_values - .iter_mut() - .map(|v| v.take_n(n)) - .collect::>(); + // In `streaming case`, the `group index view` is ensured to be `inlined` + debug_assert!(!bucket.as_ref().1.is_non_inlined()); - // SAFETY: self.map outlives iterator and is not modified concurrently - unsafe { - for bucket in self.map.iter() { - // Decrement group index by n - match bucket.as_ref().1.checked_sub(n) { + // Inlined case, we just decrement group index by n) + let group_index = bucket.as_ref().1.value() as usize; + match group_index.checked_sub(n) { // Group index was >= n, shift value down - Some(sub) => bucket.as_mut().1 = sub, + Some(sub) => { + bucket.as_mut().1 = + GroupIndexView::new_inlined(sub as u64) + } // Group index was < n, so remove from table None => self.map.erase(bucket), } } } + if !STREAMING { + self.group_index_lists.truncate(next_new_list_offset); + } + output } }; @@ -1139,6 +1019,13 @@ impl GroupValues for GroupValuesColumn { self.map_size = self.map.capacity() * size_of::<(u64, usize)>(); self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); + + // Such structure is only used in `non-streaming` case + if !STREAMING { + self.group_index_lists.clear(); + self.emit_group_index_list_buffer.clear(); + self.vectorized_operation_buffers.clear(); + } } } @@ -1152,12 +1039,10 @@ pub fn supported_schema(schema: &Schema) -> bool { .all(supported_type) } -/// Returns true if the specified data type is supported by -/// [`GroupValuesColumn`] or [`VectorizedGroupValuesColumn`] +/// Returns true if the specified data type is supported by [`GroupValuesColumn`] /// /// In order to be supported, there must be a specialized implementation of /// [`GroupColumn`] for the data type, instantiated in [`GroupValuesColumn::intern`] -/// or [`VectorizedGroupValuesColumn::intern`] fn supported_type(data_type: &DataType) -> bool { matches!( *data_type, @@ -1193,7 +1078,7 @@ mod tests { use datafusion_expr::EmitTo; use 
crate::aggregates::group_values::{ - column::VectorizedGroupValuesColumn, GroupValues, + column::GroupValuesColumn, GroupValues, }; use super::GroupIndexView; @@ -1202,7 +1087,7 @@ mod tests { fn test_intern_for_vectorized_group_values() { let data_set = VectorizedTestDataSet::new(); let mut group_values = - VectorizedGroupValuesColumn::try_new(data_set.schema()).unwrap(); + GroupValuesColumn::::try_new(data_set.schema()).unwrap(); data_set.load_to_group_values(&mut group_values); let actual_batch = group_values.emit(EmitTo::All).unwrap(); @@ -1215,7 +1100,7 @@ mod tests { fn test_emit_first_n_for_vectorized_group_values() { let data_set = VectorizedTestDataSet::new(); let mut group_values = - VectorizedGroupValuesColumn::try_new(data_set.schema()).unwrap(); + GroupValuesColumn::::try_new(data_set.schema()).unwrap(); // 1~num_rows times to emit the groups let num_rows = data_set.expected_batch.num_rows(); @@ -1266,7 +1151,8 @@ mod tests { let field = Field::new("item", DataType::Int32, true); let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); - let mut group_values = VectorizedGroupValuesColumn::try_new(schema).unwrap(); + let mut group_values = + GroupValuesColumn::::try_new(schema).unwrap(); // Insert group index views and check if success to insert insert_inline_group_index_view(&mut group_values, 0, 0); @@ -1717,7 +1603,7 @@ mod tests { } fn insert_inline_group_index_view( - group_values: &mut VectorizedGroupValuesColumn, + group_values: &mut GroupValuesColumn, hash_key: u64, group_index: u64, ) { @@ -1730,7 +1616,7 @@ mod tests { } fn insert_non_inline_group_index_view( - group_values: &mut VectorizedGroupValuesColumn, + group_values: &mut GroupValuesColumn, hash_key: u64, group_indices: Vec, ) { diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs index af1b82de6227..696407fbb4be 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs 
+++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs @@ -29,7 +29,7 @@ use primitive::GroupValuesPrimitive; mod column; mod row; -use column::VectorizedGroupValuesColumn; +use column::GroupValuesColumn; use row::GroupValuesRows; mod bytes; @@ -37,7 +37,7 @@ mod bytes_view; use bytes::GroupValuesByes; use datafusion_physical_expr::binary_map::OutputType; -use crate::aggregates::{group_values::column::GroupValuesColumn, order::GroupOrdering}; +use crate::aggregates::order::GroupOrdering; mod group_column; mod null_builder; @@ -150,9 +150,13 @@ pub fn new_group_values( if column::supported_schema(schema.as_ref()) { if matches!(group_ordering, GroupOrdering::None) { - Ok(Box::new(VectorizedGroupValuesColumn::try_new(schema)?)) + Ok(Box::new(GroupValuesColumn::::try_new( + schema, + )?)) } else { - Ok(Box::new(GroupValuesColumn::try_new(schema)?)) + Ok(Box::new(GroupValuesColumn::::try_new( + schema, + )?)) } } else { Ok(Box::new(GroupValuesRows::try_new(schema)?)) From e4bd57918b1725f37f3f500c503c07b1c1bf90bf Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 4 Nov 2024 23:15:16 +0800 Subject: [PATCH 58/60] fix fmt. 
--- .../physical-plan/src/aggregates/group_values/column.rs | 7 ++----- .../physical-plan/src/aggregates/group_values/mod.rs | 8 ++------ 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index bf5ab6213d13..aad4878de916 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -1077,9 +1077,7 @@ mod tests { use datafusion_common::utils::proxy::RawTableAllocExt; use datafusion_expr::EmitTo; - use crate::aggregates::group_values::{ - column::GroupValuesColumn, GroupValues, - }; + use crate::aggregates::group_values::{column::GroupValuesColumn, GroupValues}; use super::GroupIndexView; @@ -1151,8 +1149,7 @@ mod tests { let field = Field::new("item", DataType::Int32, true); let schema = Arc::new(Schema::new_with_metadata(vec![field], HashMap::new())); - let mut group_values = - GroupValuesColumn::::try_new(schema).unwrap(); + let mut group_values = GroupValuesColumn::::try_new(schema).unwrap(); // Insert group index views and check if success to insert insert_inline_group_index_view(&mut group_values, 0, 0); diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs index 696407fbb4be..aefd9c162246 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs @@ -150,13 +150,9 @@ pub fn new_group_values( if column::supported_schema(schema.as_ref()) { if matches!(group_ordering, GroupOrdering::None) { - Ok(Box::new(GroupValuesColumn::::try_new( - schema, - )?)) + Ok(Box::new(GroupValuesColumn::::try_new(schema)?)) } else { - Ok(Box::new(GroupValuesColumn::::try_new( - schema, - )?)) + Ok(Box::new(GroupValuesColumn::::try_new(schema)?)) } } else { Ok(Box::new(GroupValuesRows::try_new(schema)?)) 
From 14fffb846ff5f47e9a41b53ceda359f7c95d2778 Mon Sep 17 00:00:00 2001 From: kamille Date: Mon, 4 Nov 2024 23:19:57 +0800 Subject: [PATCH 59/60] fix comments. --- .../src/aggregates/group_values/column.rs | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index aad4878de916..1ee4c48fb9b1 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -207,9 +207,8 @@ impl GroupValuesColumn { /// Scalarized intern /// - /// This is used only for `streaming aggregation`, - /// because it depends on the order between `input rows` and their corresponding - /// `group indices`. + /// This is used only for `streaming aggregation`, because `streaming aggregation` + /// depends on the order between `input rows` and their corresponding `group indices`. /// /// For example, assuming `input rows` in `cols` with 4 new rows /// (not equal to `exist rows` in `group_values`, and need to create @@ -222,7 +221,7 @@ impl GroupValuesColumn { /// row4 /// ``` /// - /// # In [`GroupValuesColumn`], their `group indices` will be + /// # In `scalarized_intern`, their `group indices` will be /// /// ```text /// row1 --> 0 @@ -234,7 +233,7 @@ impl GroupValuesColumn { /// `Group indices` order agrees with their input order, and the `streaming aggregation` /// depends on this. /// - /// # However In [`VectorizedGroupValuesColumn`], their `group indices` will be + /// # However In `vectorized_intern`, their `group indices` will be /// /// ```text /// row1 --> 2 @@ -376,12 +375,12 @@ impl GroupValuesColumn { // // 3. Perform `vectorized_equal_to` for `vectorized_equal_to_row_indices` // and `vectorized_equal_to_group_indices`. 
If found some rows in input `cols` - // not equal to `exist rows` in `group_values`, place them in `scalarized_indices` - // and perform `scalarized_intern` for them similar as what in [`GroupValuesColumn`] + // not equal to `exist rows` in `group_values`, place them in `remaining_row_indices` + // and perform `scalarized_intern_remaining` for them similar as `scalarized_intern` // after. // - // 4. Perform `scalarized_intern` for rows mentioned above, when we process like this - // can see the comments of `scalarized_intern`. + // 4. Perform `scalarized_intern_remaining` for rows mentioned above, about in what situation + // we will process this can see the comments of `scalarized_intern_remaining`. // // 1. Collect vectorized context by checking hash values of `cols` in `map` @@ -394,7 +393,7 @@ impl GroupValuesColumn { self.vectorized_equal_to(cols, groups); // 4. Perform scalarized inter for remaining rows - // (about remaining rows, can see comments for `remaining_rows`) + // (about remaining rows, can see comments for `remaining_row_indices`) self.scalarized_intern_remaining(cols, &batch_hashes, groups); self.hashes_buffer = batch_hashes; @@ -1020,7 +1019,7 @@ impl GroupValues for GroupValuesColumn { self.hashes_buffer.clear(); self.hashes_buffer.shrink_to(count); - // Such structure is only used in `non-streaming` case + // Such structures are only used in `non-streaming` case if !STREAMING { self.group_index_lists.clear(); self.emit_group_index_list_buffer.clear(); From d479cc2fb4f54b8df35d7e68a27920f0cb734fc5 Mon Sep 17 00:00:00 2001 From: kamille Date: Tue, 5 Nov 2024 00:44:03 +0800 Subject: [PATCH 60/60] fix clippy. 
--- .../src/aggregates/group_values/column.rs | 86 +++++++++---------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 1ee4c48fb9b1..8100bb876ded 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -866,9 +866,7 @@ impl GroupValues for GroupValuesColumn { v.push(Box::new(b) as _) } dt => { - return not_impl_err!( - "{dt} not supported in VectorizedGroupValuesColumn" - ) + return not_impl_err!("{dt} not supported in GroupValuesColumn") } } } @@ -923,49 +921,44 @@ impl GroupValues for GroupValuesColumn { for bucket in self.map.iter() { // In non-streaming case, we need to check if the `group index view` // is `inlined` or `non-inlined` - if !STREAMING { - if bucket.as_ref().1.is_non_inlined() { - // Non-inlined case - // We take `group_index_list` from `old_group_index_lists` - - // list_offset is incrementally - self.emit_group_index_list_buffer.clear(); - let list_offset = bucket.as_ref().1.value() as usize; - for group_index in - self.group_index_lists[list_offset].iter() - { - if let Some(remaining) = group_index.checked_sub(n) { - self.emit_group_index_list_buffer.push(remaining); - } - } - - // The possible results: - // - `new_group_index_list` is empty, we should erase this bucket - // - only one value in `new_group_index_list`, switch the `view` to `inlined` - // - still multiple values in `new_group_index_list`, build and set the new `unlined view` - if self.emit_group_index_list_buffer.is_empty() { - self.map.erase(bucket); - } else if self.emit_group_index_list_buffer.len() == 1 { - let group_index = self - .emit_group_index_list_buffer - .first() - .unwrap(); - bucket.as_mut().1 = - GroupIndexView::new_inlined(*group_index as u64); - } else { - let group_index_list = - &mut 
self.group_index_lists[next_new_list_offset]; - group_index_list.clear(); - group_index_list - .extend(self.emit_group_index_list_buffer.iter()); - bucket.as_mut().1 = GroupIndexView::new_non_inlined( - next_new_list_offset as u64, - ); - next_new_list_offset += 1; + if !STREAMING && bucket.as_ref().1.is_non_inlined() { + // Non-inlined case + // We take `group_index_list` from `old_group_index_lists` + + // list_offset is incrementally + self.emit_group_index_list_buffer.clear(); + let list_offset = bucket.as_ref().1.value() as usize; + for group_index in self.group_index_lists[list_offset].iter() + { + if let Some(remaining) = group_index.checked_sub(n) { + self.emit_group_index_list_buffer.push(remaining); } + } - continue; + // The possible results: + // - `new_group_index_list` is empty, we should erase this bucket + // - only one value in `new_group_index_list`, switch the `view` to `inlined` + // - still multiple values in `new_group_index_list`, build and set the new `unlined view` + if self.emit_group_index_list_buffer.is_empty() { + self.map.erase(bucket); + } else if self.emit_group_index_list_buffer.len() == 1 { + let group_index = + self.emit_group_index_list_buffer.first().unwrap(); + bucket.as_mut().1 = + GroupIndexView::new_inlined(*group_index as u64); + } else { + let group_index_list = + &mut self.group_index_lists[next_new_list_offset]; + group_index_list.clear(); + group_index_list + .extend(self.emit_group_index_list_buffer.iter()); + bucket.as_mut().1 = GroupIndexView::new_non_inlined( + next_new_list_offset as u64, + ); + next_new_list_offset += 1; } + + continue; } // In `streaming case`, the `group index view` is ensured to be `inlined` @@ -1028,8 +1021,7 @@ impl GroupValues for GroupValuesColumn { } } -/// Returns true if [`GroupValuesColumn`] or [`VectorizedGroupValuesColumn`] -/// supported for the specified schema +/// Returns true if [`GroupValuesColumn`] supported for the specified schema pub fn supported_schema(schema: &Schema) -> 
bool { schema .fields() @@ -1247,9 +1239,9 @@ mod tests { assert!(group_values.map.is_empty()); } - /// Test data set for [`VectorizedGroupValuesColumn`] + /// Test data set for [`GroupValuesColumn::vectorized_intern`] /// - /// Define the test data and support loading them into test [`VectorizedGroupValuesColumn`] + /// Define the test data and support loading them into test [`GroupValuesColumn::vectorized_intern`] /// /// The covering situations: ///