diff --git a/.github/actions/build_linux/action.yml b/.github/actions/build_linux/action.yml index 1878382a85d4..68bb2a22b72e 100644 --- a/.github/actions/build_linux/action.yml +++ b/.github/actions/build_linux/action.yml @@ -30,7 +30,7 @@ runs: uses: ./.github/actions/setup_build_tool with: target: ${{ inputs.target }} - bypass_env_vars: RUSTFLAGS,RUST_LOG + bypass_env_vars: RUSTFLAGS,RUST_LOG,PYO3_PYTHON,LD_LIBRARY_PATH,PYO3_CROSS_PYTHON_VERSION - name: Cross setup if: startsWith(inputs.target, 'aarch64-') @@ -61,6 +61,9 @@ runs: echo "RUSTFLAGS=${flags} -C link-arg=-Wl,--compress-debug-sections=zlib" >> $GITHUB_ENV target=${{ inputs.target }} echo "BUILD_ARCH=${target/-unknown-linux-*}" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=/usr/local/lib" >> $GITHUB_ENV + echo "PYO3_PYTHON=python3.12" >> $GITHUB_ENV + echo "PYO3_CROSS_PYTHON_VERSION=/3.12" >> $GITHUB_ENV # build all binaries for debug - name: Build Debug diff --git a/.github/actions/build_linux_sanitizer/action.yml b/.github/actions/build_linux_sanitizer/action.yml index 615b59104b8b..87eb8c2a314a 100644 --- a/.github/actions/build_linux_sanitizer/action.yml +++ b/.github/actions/build_linux_sanitizer/action.yml @@ -19,7 +19,7 @@ runs: uses: ./.github/actions/setup_build_tool with: target: ${{ inputs.target }} - bypass_env_vars: RUSTFLAGS,RUST_LOG + bypass_env_vars: RUSTFLAGS,RUST_LOG,PYO3_PYTHON,LD_LIBRARY_PATH - name: Cross setup if: startsWith(inputs.target, 'aarch64-') diff --git a/.github/actions/check/action.yml b/.github/actions/check/action.yml index 161e844958e7..dd64ec578709 100644 --- a/.github/actions/check/action.yml +++ b/.github/actions/check/action.yml @@ -10,7 +10,7 @@ runs: - name: Setup Build Tool uses: ./.github/actions/setup_build_tool with: - bypass_env_vars: RUSTFLAGS,RUST_LOG,GITHUB_TOKEN + bypass_env_vars: RUSTFLAGS,RUST_LOG,GITHUB_TOKEN,PYO3_PYTHON,LD_LIBRARY_PATH - name: Check Apache License Header uses: korandoru/hawkeye@v2 diff --git a/.github/actions/setup_build_tool/action.yml b/.github/actions/setup_build_tool/action.yml index 580be26142e6..290c35a21398 100644 --- a/.github/actions/setup_build_tool/action.yml +++ b/.github/actions/setup_build_tool/action.yml @@ -8,7 +8,7 @@ inputs: bypass_env_vars: description: "Environment variables bypass to docker container" required: false - default: RUSTFLAGS,RUST_LOG + default: RUSTFLAGS,RUST_LOG,PYO3_PYTHON,LD_LIBRARY_PATH runs: using: "composite" steps: @@ -28,6 +28,9 @@ runs: cat <$BIN_LOCAL/build-tool #!/bin/bash script_name=\$(basename "\$0") + export PYO3_PYTHON=python3.12 + export LD_LIBRARY_PATH=/usr/local/lib + export PYO3_CROSS_PYTHON_VERSION=3.12 export INTERACTIVE=false export TARGET=${{ inputs.target }} export CARGO_INCREMENTAL=0 diff --git a/.github/actions/test_unit/action.yml b/.github/actions/test_unit/action.yml index adb8fc5719b4..75860bd6a374 100644 --- a/.github/actions/test_unit/action.yml +++ b/.github/actions/test_unit/action.yml @@ -7,7 +7,7 @@ runs: - name: Setup Build Tool uses: ./.github/actions/setup_build_tool with: - bypass_env_vars: RUSTFLAGS,RUSTDOCFLAGS,RUST_TEST_THREADS,RUST_LOG,RUST_BACKTRACE + bypass_env_vars: RUSTFLAGS,RUSTDOCFLAGS,RUST_TEST_THREADS,RUST_LOG,RUST_BACKTRACE,PYO3_PYTHON,LD_LIBRARY_PATH - shell: bash run: | diff --git a/.github/workflows/bindings.python.yml b/.github/workflows/bindings.python.yml index 1f756d2858e9..440d33a236da 100644 --- a/.github/workflows/bindings.python.yml +++ b/.github/workflows/bindings.python.yml @@ -1,18 +1,20 @@ name: Bindings Python on: - pull_request: - branches: - - main - paths: - - "src/**" - - ".github/workflows/bindings.python.yml" - workflow_call: - inputs: - tag: - description: Tag to release - required: true - type: string + ## uncomment it when bendpy is enabled + workflow_dispatch: + # pull_request: + # branches: + # - main + # paths: + # - "src/**" + # - ".github/workflows/bindings.python.yml" + # workflow_call: + # inputs: + # tag: + # description: Tag to release + # required: true + # type: string concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 89e6d98aeb98..cdbd31fc56db 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -604,13 +604,13 @@ jobs: with: name: test-sqlsmith - bindings_python: - if: inputs.stable - needs: create_release - uses: ./.github/workflows/bindings.python.yml - secrets: inherit - with: - tag: ${{ needs.create_release.outputs.version }} + # bindings_python: + # if: inputs.stable + # needs: create_release + # uses: ./.github/workflows/bindings.python.yml + # secrets: inherit + # with: + # tag: ${{ needs.create_release.outputs.version }} notify: runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 50843d6e06ec..921332ab9c38 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,7 +301,6 @@ dependencies = [ "arrow-schema", "arrow-select", "arrow-string", - "pyo3", ] [[package]] @@ -509,7 +508,6 @@ version = "51.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02d9483aaabe910c4781153ae1b6ae0393f72d9ef757d38d09d450070cf2e528" dependencies = [ - "bitflags 2.4.2", "serde", ] @@ -557,6 +555,21 @@ dependencies = [ "rquickjs", ] +[[package]] +name = "arrow-udf-python" +version = "0.1.0" +source = "git+https://github.com/datafuse-extras/arrow-udf?rev=d0a21f0#d0a21f0fde330a0e5f658a55b58e0405d8372844" +dependencies = [ + "anyhow", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", + "lazy_static", + "pyo3", + "pyo3-build-config", +] + [[package]] name = "arrow-udf-wasm" version = "0.2.2" @@ -1046,28 +1059,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" -[[package]] -name = "bendpy" -version = "0.0.0" -dependencies = [ - "arrow", - "arrow-schema", - "ctor 0.2.7", - "databend-common-config", - "databend-common-exception", - "databend-common-expression", - "databend-common-license", - "databend-common-meta-app", - "databend-common-meta-embedded", - "databend-common-users", - "databend-query", - "pyo3", - "pyo3-build-config 0.18.3", - "tokio", - "tokio-stream", - "uuid", -] - [[package]] name = "bigdecimal" version = "0.4.2" @@ -2486,16 +2477,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "ctor" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad291aa74992b9b7a7e88c38acbbf6ad7e107f1d90ee8775b7bc1fc3394f485c" -dependencies = [ - "quote", - "syn 2.0.52", -] - [[package]] name = "ctrlc" version = "3.4.2" @@ -3089,7 +3070,7 @@ dependencies = [ "comfy-table 6.2.0", "crc32fast", "criterion", - "ctor 0.1.26", + "ctor", "databend-common-arrow", "databend-common-ast", "databend-common-base", @@ -3749,7 +3730,7 @@ dependencies = [ "chrono-tz", "cidr", "cron", - "ctor 0.1.26", + "ctor", "dashmap", "databend-common-ast", "databend-common-async-functions", @@ -4609,6 +4590,7 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-udf-js", + "arrow-udf-python", "arrow-udf-wasm", "async-backtrace", "async-channel 1.9.0", @@ -4623,7 +4605,7 @@ dependencies = [ "chrono-tz", "config", "criterion", - "ctor 0.1.26", + "ctor", "dashmap", "databend-common-arrow", "databend-common-ast", @@ -9472,7 +9454,7 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fb9b5c752f145ac5046bccc3c4f62892e3c950c1d1eab80c5949cd68a2078db" dependencies = [ - "ctor 0.1.26", + "ctor", "web-time", ] @@ -11409,9 +11391,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" +checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" dependencies = [ "cfg-if", "indoc", @@ -11419,7 +11401,7 @@ dependencies = [ "memoffset", "parking_lot 0.12.1", "portable-atomic", - "pyo3-build-config 0.20.3", + "pyo3-build-config", "pyo3-ffi", "pyo3-macros", "unindent", @@ -11427,19 +11409,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.18.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cb946f5ac61bb61a5014924910d936ebd2b23b705f7a4a3c40b05c720b079a3" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-build-config" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" +checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" dependencies = [ "once_cell", "target-lexicon", @@ -11447,19 +11419,19 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" +checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" dependencies = [ "libc", - "pyo3-build-config 0.20.3", + "pyo3-build-config", ] [[package]] name = "pyo3-macros" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" +checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -11469,13 +11441,13 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.20.3" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" +checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" dependencies = [ "heck 0.4.1", "proc-macro2", - "pyo3-build-config 0.20.3", + "pyo3-build-config", "quote", "syn 2.0.52", ] diff --git a/Cargo.toml b/Cargo.toml index 148ecdc72732..f21fb7518390 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -105,7 +105,7 @@ members = [ "src/meta/service", # sqllogictest "tests/sqllogictests", - "src/bendpy", + # "src/bendpy", # sqlsmith "src/tests/sqlsmith", ] diff --git a/src/bendpy/pyproject.toml b/src/bendpy/pyproject.toml index 322bac60e054..99619e7fe8d1 100644 --- a/src/bendpy/pyproject.toml +++ b/src/bendpy/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "databend" -requires-python = ">=3.7" +requires-python = ">=3.12" description = "Databend Python Binding" classifiers = [ "Programming Language :: Rust", diff --git a/src/binaries/Cargo.toml b/src/binaries/Cargo.toml index fde65f348b63..416e18cd1807 100644 --- a/src/binaries/Cargo.toml +++ b/src/binaries/Cargo.toml @@ -14,6 +14,7 @@ memory-profiling = [ "databend-query/memory-profiling", "databend-common-base/memory-profiling", ] +python-udf = ["databend-query/python-udf"] simd = ["databend-meta/simd", "databend-query/simd"] z3-prove = ["databend-query/z3-prove"] jemalloc = ["databend-common-base/jemalloc"] diff --git a/src/common/exception/src/exception_code.rs b/src/common/exception/src/exception_code.rs index 32403c2fb0f1..8fc226fb1628 100644 --- a/src/common/exception/src/exception_code.rs +++ b/src/common/exception/src/exception_code.rs @@ -191,6 +191,7 @@ build_exceptions! { /// /// For example: license key is expired LicenseKeyInvalid(1402), + EnterpriseFeatureNotEnable(1403), BackgroundJobAlreadyExists(1501), UnknownBackgroundJob(1502), diff --git a/src/query/service/Cargo.toml b/src/query/service/Cargo.toml index 1bb11b4d951b..ca9ec64a5cbb 100644 --- a/src/query/service/Cargo.toml +++ b/src/query/service/Cargo.toml @@ -14,7 +14,7 @@ test = true [features] default = ["simd", "z3-prove"] simd = ["databend-common-arrow/simd"] - +python-udf = ["arrow-udf-python"] z3-prove = ["databend-common-sql/z3-prove"] disable_initial_exec_tls = ["databend-common-base/disable_initial_exec_tls"] @@ -103,6 +103,7 @@ jsonb = { workspace = true } # Crates.io dependencies arrow-udf-js = { workspace = true } +arrow-udf-python = { package = "arrow-udf-python", git = "https://github.com/datafuse-extras/arrow-udf", rev = "d0a21f0", optional = true } arrow-udf-wasm = { workspace = true } arrow-array = { workspace = true } diff --git a/src/query/service/src/pipelines/processors/transforms/transform_udf_script.rs b/src/query/service/src/pipelines/processors/transforms/transform_udf_script.rs index 8ad16ba1441b..8d2e2b6fe513 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_udf_script.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_udf_script.rs @@ -39,9 +39,15 @@ use crate::pipelines::processors::InputPort; use crate::pipelines::processors::OutputPort; use crate::pipelines::processors::Processor; +/// python runtime should be only initialized once by gil lock, see: https://github.com/python/cpython/blob/main/Python/pystate.c +#[cfg(feature = "python-udf")] +static GLOBAL_PYTHON_RUNTIME: std::sync::LazyLock>> = + std::sync::LazyLock::new(|| Arc::new(RwLock::new(arrow_udf_python::Runtime::new().unwrap()))); + pub enum ScriptRuntime { JavaScript(Arc>), WebAssembly(Arc>), + Python, } impl ScriptRuntime { @@ -59,6 +65,7 @@ impl ScriptRuntime { ErrorCode::UDFDataError(format!("Cannot create js runtime: {}", err)) }), "wasm" => Self::create_wasm_runtime(code), + "python" => Ok(Self::Python), _ => Err(ErrorCode::from_string(format!( "Invalid {} lang Runtime not supported", lang @@ -101,6 +108,24 @@ impl ScriptRuntime { &func.func_name, ) } + #[cfg(feature = "python-udf")] + ScriptRuntime::Python => { + let code: &str = std::str::from_utf8(code)?; + let mut runtime = GLOBAL_PYTHON_RUNTIME.write(); + runtime.add_function_with_handler( + &func.name, + arrow_schema.field(0).data_type().clone(), + arrow_udf_python::CallMode::ReturnNullOnNullInput, + code, + &func.func_name, + ) + } + #[cfg(not(feature = "python-udf"))] + ScriptRuntime::Python => { + return Err(ErrorCode::EnterpriseFeatureNotEnable( + "Failed to create python script udf", + )); + } // Ignore the execution for WASM context ScriptRuntime::WebAssembly(_) => Ok(()), }?; @@ -123,6 +148,22 @@ impl ScriptRuntime { )) })? } + #[cfg(feature = "python-udf")] + ScriptRuntime::Python => { + let runtime = GLOBAL_PYTHON_RUNTIME.read(); + runtime.call(&func.name, input_batch).map_err(|err| { + ErrorCode::UDFDataError(format!( + "Python UDF '{}' execution failed: {}", + func.name, err + )) + })? + } + #[cfg(not(feature = "python-udf"))] + ScriptRuntime::Python => { + return Err(ErrorCode::EnterpriseFeatureNotEnable( + "Failed to execute python script udf", + )); + } ScriptRuntime::WebAssembly(runtime) => { let runtime = runtime.read(); runtime.call(&func.func_name, input_batch).map_err(|err| { diff --git a/src/query/sql/src/planner/binder/udf.rs b/src/query/sql/src/planner/binder/udf.rs index 5124aef88b69..76134548d898 100644 --- a/src/query/sql/src/planner/binder/udf.rs +++ b/src/query/sql/src/planner/binder/udf.rs @@ -41,7 +41,8 @@ use crate::Binder; impl Binder { fn is_allowed_language(language: &str) -> bool { - let allowed_languages: HashSet<&str> = ["javascript", "wasm"].iter().cloned().collect(); + let allowed_languages: HashSet<&str> = + ["javascript", "wasm", "python"].iter().cloned().collect(); allowed_languages.contains(&language.to_lowercase().as_str()) } @@ -148,7 +149,7 @@ impl Binder { if !Self::is_allowed_language(language) { return Err(ErrorCode::InvalidArgument(format!( - "Unallowed UDF language '{language}', must be javascript or wasm" + "Unallowed UDF language '{language}', must be python, javascript or wasm" ))); } diff --git a/tests/sqllogictests/suites/udf_native/03_0001_udf_js.test b/tests/sqllogictests/suites/udf_native/03_0001_udf_js.test index d13fd8ca6966..c963c8c2600a 100644 --- a/tests/sqllogictests/suites/udf_native/03_0001_udf_js.test +++ b/tests/sqllogictests/suites/udf_native/03_0001_udf_js.test @@ -1,5 +1,5 @@ statement ok -CREATE FUNCTION gcd_js (INT, INT) RETURNS BIGINT LANGUAGE javascript HANDLER = 'gcd_js' AS $$ +CREATE OR REPLACE FUNCTION gcd_js (INT, INT) RETURNS BIGINT LANGUAGE javascript HANDLER = 'gcd_js' AS $$ export function gcd_js(a, b) { while (b != 0) { let t = b; diff --git a/tests/sqllogictests/suites/udf_native/03_0001_udf_py.test b/tests/sqllogictests/suites/udf_native/03_0001_udf_py.test new file mode 100644 index 000000000000..059763c94657 --- /dev/null +++ b/tests/sqllogictests/suites/udf_native/03_0001_udf_py.test @@ -0,0 +1,16 @@ +## enable it when compiled with ee feature +## statement ok +## CREATE OR REPLACE FUNCTION gcd_py (INT, INT) RETURNS BIGINT LANGUAGE python HANDLER = 'gcd' AS $$ +## def gcd(a: int, b: int) -> int: +## while b: +## a, b = b, a % b +## return a +## $$ + +## query F +## select number, gcd_py(number * 3, number * 6) from numbers(5) where number > 0 order by 1; +## ---- +## 1 3 +## 2 6 +## 3 9 +## 4 12