Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry PR snapshot: Fix panic when hashing empty FixedSizeList Array #76

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/dependencies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Check dependencies
run: |
cd dev/depcheck
Expand Down
28 changes: 14 additions & 14 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable

- name: Cache Cargo
uses: actions/cache@v4
Expand Down Expand Up @@ -144,7 +144,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run tests (excluding doctests)
run: cargo test --lib --tests --bins --features avro,json,backtrace
- name: Verify Working Directory Clean
Expand All @@ -163,7 +163,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run tests (excluding doctests)
run: |
cd datafusion-cli
Expand All @@ -184,7 +184,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run examples
run: |
# test datafusion-sql examples
Expand All @@ -210,7 +210,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run doctests
run: |
cargo test --doc --features avro,json
Expand All @@ -231,7 +231,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run cargo doc
run: ci/scripts/rust_docs.sh

Expand All @@ -245,7 +245,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Install wasm-pack
run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
- name: Build with wasm-pack
Expand All @@ -266,7 +266,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Generate benchmark data and expected query results
run: |
mkdir -p datafusion/sqllogictest/test_files/tpch/data
Expand Down Expand Up @@ -384,7 +384,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run datafusion-common tests
run: cargo test -p datafusion-common --features=pyarrow

Expand Down Expand Up @@ -413,7 +413,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run
run: |
echo '' > datafusion/proto/src/generated/datafusion.rs
Expand Down Expand Up @@ -474,7 +474,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Install Clippy
run: rustup component add clippy
- name: Run clippy
Expand All @@ -494,7 +494,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Run tests
run: |
cd datafusion
Expand All @@ -513,7 +513,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- name: Install taplo
run: cargo +stable install taplo-cli --version ^0.9 --locked
# if you encounter an error, try running 'taplo format' to fix the formatting automatically.
Expand All @@ -533,7 +533,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: 1.82
rust-version: stable
- uses: actions/setup-node@v4
with:
node-version: "20"
Expand Down
3 changes: 1 addition & 2 deletions datafusion/common/src/functional_dependencies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,12 @@
//! FunctionalDependencies keeps track of functional dependencies
//! inside DFSchema.

use std::collections::HashSet;
use std::fmt::{Display, Formatter};
use std::ops::Deref;
use std::vec::IntoIter;

use crate::utils::{merge_and_order_indices, set_difference};
use crate::{DFSchema, JoinType};
use crate::{DFSchema, HashSet, JoinType};

/// This object defines a constraint on a table.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
Expand Down
16 changes: 15 additions & 1 deletion datafusion/common/src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,11 @@ fn hash_fixed_list_array(
) -> Result<()> {
let values = Arc::clone(array.values());
let value_len = array.value_length();
let offset_size = value_len as usize / array.len();
let offset_size = if array.len() == 0 {
0
} else {
value_len as usize / array.len()
};
let nulls = array.nulls();
let mut values_hashes = vec![0u64; values.len()];
create_hashes(&[values], random_state, &mut values_hashes)?;
Expand Down Expand Up @@ -462,6 +466,16 @@ mod tests {
Ok(())
}

#[test]
fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let hashes_buff = &mut vec![0; 0];
let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?;
assert_eq!(hashes, &Vec::<u64>::new());
Ok(())
}

#[test]
fn create_hashes_for_float_arrays() -> Result<()> {
let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
Expand Down
5 changes: 5 additions & 0 deletions datafusion/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ pub use functional_dependencies::{
get_target_functional_dependencies, Constraint, Constraints, Dependency,
FunctionalDependence, FunctionalDependencies,
};
use hashbrown::hash_map::DefaultHashBuilder;
pub use join_type::{JoinConstraint, JoinSide, JoinType};
pub use param_value::ParamValues;
pub use scalar::{ScalarType, ScalarValue};
Expand All @@ -87,6 +88,10 @@ pub use error::{
_substrait_datafusion_err,
};

// The `HashMap` and `HashSet` implementations that should be used as the uniform defaults
pub type HashMap<K, V, S = DefaultHashBuilder> = hashbrown::HashMap<K, V, S>;
pub type HashSet<T, S = DefaultHashBuilder> = hashbrown::HashSet<T, S>;

/// Downcast an Arrow Array to a concrete type, return an `DataFusionError::Internal` if the cast is
/// not possible. In normal usage of DataFusion the downcast should always succeed.
///
Expand Down
3 changes: 1 addition & 2 deletions datafusion/core/src/bin/print_functions_docs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
// under the License.

use datafusion::execution::SessionStateDefaults;
use datafusion_common::{not_impl_err, Result};
use datafusion_common::{not_impl_err, HashSet, Result};
use datafusion_expr::{
aggregate_doc_sections, scalar_doc_sections, window_doc_sections, AggregateUDF,
DocSection, Documentation, ScalarUDF, WindowUDF,
};
use hashbrown::HashSet;
use itertools::Itertools;
use std::env::args;
use std::fmt::Write as _;
Expand Down
6 changes: 4 additions & 2 deletions datafusion/core/src/catalog_common/listing_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@
//! [`ListingSchemaProvider`]: [`SchemaProvider`] that scans ObjectStores for tables automatically

use std::any::Any;
use std::collections::{HashMap, HashSet};
use std::collections::HashSet;
use std::path::Path;
use std::sync::{Arc, Mutex};

use crate::catalog::{SchemaProvider, TableProvider, TableProviderFactory};
use crate::execution::context::SessionState;

use datafusion_common::{Constraints, DFSchema, DataFusionError, TableReference};
use datafusion_common::{
Constraints, DFSchema, DataFusionError, HashMap, TableReference,
};
use datafusion_expr::CreateExternalTable;

use async_trait::async_trait;
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/file_format/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ use datafusion_physical_plan::metrics::MetricsSet;

use async_trait::async_trait;
use bytes::Bytes;
use hashbrown::HashMap;
use datafusion_common::HashMap;
use log::debug;
use object_store::buffered::BufWriter;
use parquet::arrow::arrow_writer::{
Expand Down
3 changes: 1 addition & 2 deletions datafusion/core/src/datasource/listing/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,14 @@

//! Helper functions for the table implementation

use std::collections::HashMap;
use std::mem;
use std::sync::Arc;

use super::ListingTableUrl;
use super::PartitionedFile;
use crate::execution::context::SessionState;
use datafusion_common::internal_err;
use datafusion_common::{Result, ScalarValue};
use datafusion_common::{HashMap, Result, ScalarValue};
use datafusion_expr::{BinaryExpr, Operator};

use arrow::{
Expand Down
4 changes: 1 addition & 3 deletions datafusion/core/src/physical_optimizer/sort_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties};
use datafusion_common::tree_node::{
ConcreteTreeNode, Transformed, TreeNode, TreeNodeRecursion,
};
use datafusion_common::{plan_err, JoinSide, Result};
use datafusion_common::{plan_err, HashSet, JoinSide, Result};
use datafusion_expr::JoinType;
use datafusion_physical_expr::expressions::Column;
use datafusion_physical_expr::utils::collect_columns;
Expand All @@ -41,8 +41,6 @@ use datafusion_physical_expr_common::sort_expr::{
LexOrdering, LexOrderingRef, LexRequirement,
};

use hashbrown::HashSet;

/// This is a "data class" we use within the [`EnforceSorting`] rule to push
/// down [`SortExec`] in the plan. In some cases, we can reduce the total
/// computational cost by pushing down `SortExec`s through some executors. The
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ use test_utils::{add_empty_batches, StringBatchGenerator};
use crate::fuzz_cases::aggregation_fuzzer::{
AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig, QueryBuilder,
};
use datafusion_common::HashMap;
use datafusion_physical_expr_common::sort_expr::LexOrdering;
use hashbrown::HashMap;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use tokio::task::JoinSet;
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/tests/fuzz_cases/window_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
use test_utils::add_empty_batches;

use datafusion::functions_window::row_number::row_number_udwf;
use datafusion_common::HashMap;
use datafusion_functions_window::lead_lag::{lag_udwf, lead_udwf};
use datafusion_functions_window::rank::{dense_rank_udwf, rank_udwf};
use datafusion_physical_expr_common::sort_expr::LexOrdering;
use hashbrown::HashMap;
use rand::distributions::Alphanumeric;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
Expand Down
11 changes: 8 additions & 3 deletions datafusion/core/tests/memory_limit/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use datafusion_execution::memory_pool::{
};
use datafusion_expr::{Expr, TableType};
use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr};
use datafusion_physical_plan::spill::get_record_batch_memory_size;
use futures::StreamExt;
use std::any::Any;
use std::num::NonZeroUsize;
Expand Down Expand Up @@ -265,14 +266,18 @@ async fn sort_spill_reservation() {
// This test case shows how sort_spill_reservation works by
// purposely sorting data that requires non trivial memory to
// sort/merge.

// Merge operation needs extra memory to do row conversion, so make the
// memory limit larger.
let mem_limit = partition_size * 2;
let test = TestCase::new()
// This query uses a different order than the input table to
// force a sort. It also needs to have multiple columns to
// force RowFormat / interner that makes merge require
// substantial memory
.with_query("select * from t ORDER BY a , b DESC")
// enough memory to sort if we don't try to merge it all at once
.with_memory_limit(partition_size)
.with_memory_limit(mem_limit)
// use a single partition so only a sort is needed
.with_scenario(scenario)
.with_disk_manager_config(DiskManagerConfig::NewOs)
Expand Down Expand Up @@ -311,7 +316,7 @@ async fn sort_spill_reservation() {
// reserve sufficient space up front for merge and this time,
// which will force the spills to happen with less buffered
// input and thus with enough to merge.
.with_sort_spill_reservation_bytes(partition_size / 2);
.with_sort_spill_reservation_bytes(mem_limit / 2);

test.with_config(config).with_expected_success().run().await;
}
Expand Down Expand Up @@ -774,7 +779,7 @@ fn make_dict_batches() -> Vec<RecordBatch> {

// How many bytes does the memory from dict_batches consume?
fn batches_byte_size(batches: &[RecordBatch]) -> usize {
batches.iter().map(|b| b.get_array_memory_size()).sum()
batches.iter().map(get_record_batch_memory_size).sum()
}

#[derive(Debug)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
// under the License.

use std::any::Any;
use std::collections::HashMap;
use std::hash::{DefaultHasher, Hash, Hasher};
use std::sync::Arc;

Expand All @@ -39,7 +38,8 @@ use datafusion_common::cast::{as_float64_array, as_int32_array};
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::{
assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_err, internal_err,
not_impl_err, plan_err, DFSchema, DataFusionError, ExprSchema, Result, ScalarValue,
not_impl_err, plan_err, DFSchema, DataFusionError, ExprSchema, HashMap, Result,
ScalarValue,
};
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
use datafusion_expr::{
Expand Down
2 changes: 1 addition & 1 deletion datafusion/execution/src/memory_pool/pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
// under the License.

use crate::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation};
use datafusion_common::HashMap;
use datafusion_common::{resources_datafusion_err, DataFusionError, Result};
use hashbrown::HashMap;
use log::debug;
use parking_lot::Mutex;
use std::{
Expand Down
3 changes: 1 addition & 2 deletions datafusion/expr/src/conditional_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
use crate::expr::Case;
use crate::{expr_schema::ExprSchemable, Expr};
use arrow::datatypes::DataType;
use datafusion_common::{plan_err, DFSchema, Result};
use std::collections::HashSet;
use datafusion_common::{plan_err, DFSchema, HashSet, Result};

/// Helper struct for building [Expr::Case]
pub struct CaseBuilder {
Expand Down
2 changes: 1 addition & 1 deletion datafusion/expr/src/execution_props.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
use crate::var_provider::{VarProvider, VarType};
use chrono::{DateTime, TimeZone, Utc};
use datafusion_common::alias::AliasGenerator;
use std::collections::HashMap;
use datafusion_common::HashMap;
use std::sync::Arc;

/// Holds per-query execution properties and data (such as statement
Expand Down
Loading