Skip to content

Commit

Permalink
Merge branch 'main' into feature-scalar_regexp_match_expr
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuliquan committed Sep 17, 2024
2 parents b770697 + a08f923 commit a600f4d
Show file tree
Hide file tree
Showing 371 changed files with 32,105 additions and 22,502 deletions.
74 changes: 39 additions & 35 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.76"
version = "41.0.0"
version = "42.0.0"

[workspace.dependencies]
# We turn off default-features for some dependencies here so the workspaces which inherit them can
Expand All @@ -69,50 +69,50 @@ version = "41.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
arrow = { version = "52.2.0", features = [
arrow = { version = "53.0.0", features = [
"prettyprint",
] }
arrow-array = { version = "52.2.0", default-features = false, features = [
arrow-array = { version = "53.0.0", default-features = false, features = [
"chrono-tz",
] }
arrow-buffer = { version = "52.2.0", default-features = false }
arrow-flight = { version = "52.2.0", features = [
arrow-buffer = { version = "53.0.0", default-features = false }
arrow-flight = { version = "53.0.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "52.2.0", default-features = false, features = [
arrow-ipc = { version = "53.0.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "52.2.0", default-features = false }
arrow-schema = { version = "52.2.0", default-features = false }
arrow-string = { version = "52.2.0", default-features = false }
arrow-ord = { version = "53.0.0", default-features = false }
arrow-schema = { version = "53.0.0", default-features = false }
arrow-string = { version = "53.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "=0.4.1"
bytes = "1.4"
chrono = { version = "0.4.34", default-features = false }
chrono = { version = "0.4.38", default-features = false }
ctor = "0.2.0"
dashmap = "6.0.1"
datafusion = { path = "datafusion/core", version = "41.0.0", default-features = false }
datafusion-catalog = { path = "datafusion/catalog", version = "41.0.0" }
datafusion-common = { path = "datafusion/common", version = "41.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "41.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "41.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "41.0.0" }
datafusion-expr-common = { path = "datafusion/expr-common", version = "41.0.0" }
datafusion-functions = { path = "datafusion/functions", version = "41.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "41.0.0" }
datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "41.0.0" }
datafusion-functions-nested = { path = "datafusion/functions-nested", version = "41.0.0" }
datafusion-functions-window = { path = "datafusion/functions-window", version = "41.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "41.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "41.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "41.0.0", default-features = false }
datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "41.0.0" }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "41.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "41.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "41.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "41.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "41.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "41.0.0" }
datafusion = { path = "datafusion/core", version = "42.0.0", default-features = false }
datafusion-catalog = { path = "datafusion/catalog", version = "42.0.0" }
datafusion-common = { path = "datafusion/common", version = "42.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "42.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "42.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "42.0.0" }
datafusion-expr-common = { path = "datafusion/expr-common", version = "42.0.0" }
datafusion-functions = { path = "datafusion/functions", version = "42.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "42.0.0" }
datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "42.0.0" }
datafusion-functions-nested = { path = "datafusion/functions-nested", version = "42.0.0" }
datafusion-functions-window = { path = "datafusion/functions-window", version = "42.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "42.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "42.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "42.0.0", default-features = false }
datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "42.0.0" }
datafusion-physical-plan = { path = "datafusion/physical-plan", version = "42.0.0" }
datafusion-proto = { path = "datafusion/proto", version = "42.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "42.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "42.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "42.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "42.0.0" }
doc-comment = "0.3"
env_logger = "0.11"
futures = "0.3"
Expand All @@ -122,18 +122,22 @@ indexmap = "2.0.0"
itertools = "0.13"
log = "^0.4"
num_cpus = "1.13.0"
object_store = { version = "0.10.2", default-features = false }
object_store = { version = "0.11.0", default-features = false }
parking_lot = "0.12"
parquet = { version = "52.2.0", default-features = false, features = [
parquet = { version = "53.0.0", default-features = false, features = [
"arrow",
"async",
"object_store",
] }
pbjson = { version = "0.7.0" }
# Should match arrow-flight's version of prost.
prost = "0.13.1"
prost-derive = "0.13.1"
rand = "0.8"
regex = "1.8"
rstest = "0.22.0"
serde_json = "1"
sqlparser = { version = "0.50.0", features = ["visitor"] }
sqlparser = { version = "0.51.0", features = ["visitor"] }
tempfile = "3"
thiserror = "1.0.44"
tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
Expand Down
44 changes: 33 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,36 @@
[discord-badge]: https://img.shields.io/discord/885562378132000778.svg?logo=discord&style=flat-square
[discord-url]: https://discord.com/invite/Qw5gKqHxUM

[Website](https://github.com/apache/datafusion) |
[Guides](https://github.com/apache/datafusion/tree/main/docs) |
[Website](https://datafusion.apache.org/) |
[API Docs](https://docs.rs/datafusion/latest/datafusion/) |
[Chat](https://discord.com/channels/885562378132000778/885562378132000781)

<img src="./docs/source/_static/images/2x_bgwhite_original.png" width="512" alt="logo"/>

Apache DataFusion is a very fast, extensible query engine for building high-quality data-centric systems in
[Rust](http://rustlang.org), using the [Apache Arrow](https://arrow.apache.org)
in-memory format. [Python Bindings](https://github.com/apache/datafusion-python) are also available. DataFusion offers SQL and Dataframe APIs, excellent [performance](https://benchmark.clickhouse.com/), built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community.
<a href="https://datafusion.apache.org/">
<img src="./docs/source/_static/images/2x_bgwhite_original.png" width="512" alt="logo"/>
</a>

DataFusion is an extensible query engine written in [Rust] that
uses [Apache Arrow] as its in-memory format. DataFusion's target users are
developers building fast and feature rich database and analytic systems,
customized to particular workloads. See [use cases] for examples.

"Out of the box," DataFusion offers [SQL] and [`Dataframe`] APIs,
excellent [performance], built-in support for CSV, Parquet, JSON, and Avro,
extensive customization, and a great community.
[Python Bindings] are also available.

DataFusion features a full query planner, a columnar, streaming, multi-threaded,
vectorized execution engine, and partitioned data sources. You can
customize DataFusion at almost all points including additional data sources,
query languages, functions, custom operators and more.
See the [Architecture] section for more details.

[rust]: http://rustlang.org
[apache arrow]: https://arrow.apache.org
[use cases]: https://datafusion.apache.org/user-guide/introduction.html#use-cases
[python bindings]: https://github.com/apache/datafusion-python
[performance]: https://benchmark.clickhouse.com/
[architecture]: https://datafusion.apache.org/contributor-guide/architecture.html

Here are links to some important information

Expand Down Expand Up @@ -97,9 +117,11 @@ Optional features:

## Rust Version Compatibility Policy

DataFusion's Minimum Required Stable Rust Version (MSRV) policy is to support
each stable Rust version for 6 months after it is
[released](https://github.com/rust-lang/rust/blob/master/RELEASES.md). This
generally translates to support for the most recent 3 to 4 stable Rust versions.
DataFusion's Minimum Required Stable Rust Version (MSRV) policy is to support stable [4 latest
Rust versions](https://releases.rs) OR the stable minor Rust version as of 4 months, whichever is lower.

For example, given the releases `1.78.0`, `1.79.0`, `1.80.0`, `1.80.1` and `1.81.0` DataFusion will support 1.78.0, which is 3 minor versions prior to the most minor recent `1.81`.

If a hotfix is released for the minimum supported Rust version (MSRV), the MSRV will be the minor version with all hotfixes, even if it surpasses the four-month window.

We enforce this policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)
38 changes: 19 additions & 19 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
**********
* Supported Configuration (Environment Variables)
Expand Down Expand Up @@ -106,7 +106,7 @@ while [[ $# -gt 0 ]]; do
shift # past argument
usage
;;
-*|--*)
-*)
echo "Unknown option $1"
exit 1
;;
Expand Down Expand Up @@ -175,7 +175,7 @@ main() {
run)
# Parse positional parameters
BENCHMARK=${ARG2:-"${BENCHMARK}"}
BRANCH_NAME=$(cd ${DATAFUSION_DIR} && git rev-parse --abbrev-ref HEAD)
BRANCH_NAME=$(cd "${DATAFUSION_DIR}" && git rev-parse --abbrev-ref HEAD)
BRANCH_NAME=${BRANCH_NAME//\//_} # mind blowing syntax to replace / with _
RESULTS_NAME=${RESULTS_NAME:-"${BRANCH_NAME}"}
RESULTS_DIR=${RESULTS_DIR:-"$SCRIPT_DIR/results/$RESULTS_NAME"}
Expand All @@ -189,7 +189,7 @@ main() {
echo "DATA_DIR: ${DATA_DIR}"
echo "RESULTS_DIR: ${RESULTS_DIR}"
echo "CARGO_COMMAND: ${CARGO_COMMAND}"
echo "PREFER_HASH_JOIN": ${PREFER_HASH_JOIN}
echo "PREFER_HASH_JOIN: ${PREFER_HASH_JOIN}"
echo "***************************"

# navigate to the appropriate directory
Expand Down Expand Up @@ -288,7 +288,7 @@ data_tpch() {
echo " tbl files exist ($FILE exists)."
else
echo " creating tbl files with tpch_dbgen..."
docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s ${SCALE_FACTOR}
docker run -v "${TPCH_DIR}":/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}"
fi

# Copy expected answers into the ./data/answers directory if it does not already exist
Expand Down Expand Up @@ -325,7 +325,7 @@ run_tpch() {
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch benchmark..."
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
}

# Runs the tpch in memory
Expand All @@ -341,23 +341,23 @@ run_tpch_mem() {
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running tpch_mem benchmark..."
# -m means in memory
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} -m --format parquet -o ${RESULTS_FILE}
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}"
}

# Runs the parquet filter benchmark
run_parquet() {
RESULTS_FILE="${RESULTS_DIR}/parquet.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running parquet filter benchmark..."
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
}

# Runs the sort benchmark
run_sort() {
RESULTS_FILE="${RESULTS_DIR}/sort.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running sort benchmark..."
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --prefer_hash_join ${PREFER_HASH_JOIN} --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
$CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --scale-factor 1.0 --iterations 5 -o "${RESULTS_FILE}"
}


Expand All @@ -369,7 +369,7 @@ data_clickbench_1() {
pushd "${DATA_DIR}" > /dev/null

# Avoid downloading if it already exists and is the right size
OUTPUT_SIZE=`wc -c hits.parquet 2>/dev/null | awk '{print $1}' || true`
OUTPUT_SIZE=$(wc -c hits.parquet 2>/dev/null | awk '{print $1}' || true)
echo -n "Checking hits.parquet..."
if test "${OUTPUT_SIZE}" = "14779976446"; then
echo -n "... found ${OUTPUT_SIZE} bytes ..."
Expand All @@ -393,7 +393,7 @@ data_clickbench_partitioned() {
pushd "${DATA_DIR}/hits_partitioned" > /dev/null

echo -n "Checking hits_partitioned..."
OUTPUT_SIZE=`wc -c * 2>/dev/null | tail -n 1 | awk '{print $1}' || true`
OUTPUT_SIZE=$(wc -c -- * 2>/dev/null | tail -n 1 | awk '{print $1}' || true)
if test "${OUTPUT_SIZE}" = "14737666736"; then
echo -n "... found ${OUTPUT_SIZE} bytes ..."
else
Expand All @@ -411,23 +411,23 @@ run_clickbench_1() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
}

# Runs the clickbench benchmark with the partitioned parquet files
run_clickbench_partitioned() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (partitioned, 100 files) benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
}

# Runs the clickbench "extended" benchmark with a single large parquet file
run_clickbench_extended() {
RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
echo "RESULTS_FILE: ${RESULTS_FILE}"
echo "Running clickbench (1 file) extended benchmark..."
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE}
$CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
}

compare_benchmarks() {
Expand All @@ -447,12 +447,12 @@ compare_benchmarks() {
fi

echo "Comparing ${BRANCH1} and ${BRANCH2}"
for bench in `ls ${BASE_RESULTS_DIR}/${BRANCH1}` ; do
RESULTS_FILE1="${BASE_RESULTS_DIR}/${BRANCH1}/${bench}"
RESULTS_FILE2="${BASE_RESULTS_DIR}/${BRANCH2}/${bench}"
for RESULTS_FILE1 in "${BASE_RESULTS_DIR}/${BRANCH1}"/*.json ; do
BENCH=$(basename "${RESULTS_FILE1}")
RESULTS_FILE2="${BASE_RESULTS_DIR}/${BRANCH2}/${BENCH}"
if test -f "${RESULTS_FILE2}" ; then
echo "--------------------"
echo "Benchmark ${bench}"
echo "Benchmark ${BENCH}"
echo "--------------------"
PATH=$VIRTUAL_ENV/bin:$PATH python3 "${SCRIPT_DIR}"/compare.py "${RESULTS_FILE1}" "${RESULTS_FILE2}"
else
Expand All @@ -463,7 +463,7 @@ compare_benchmarks() {
}

setup_venv() {
python3 -m venv $VIRTUAL_ENV
python3 -m venv "$VIRTUAL_ENV"
PATH=$VIRTUAL_ENV/bin:$PATH python3 -m pip install -r requirements.txt
}

Expand Down
Loading

0 comments on commit a600f4d

Please sign in to comment.