From 077fb8c29258d7552da6b18504e448e301673d5f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 7 Nov 2025 07:03:58 -0500 Subject: [PATCH 1/8] Remove pyarrow --- .github/workflows/rust.yml | 26 ----- Cargo.lock | 102 ------------------- README.md | 1 - ci/scripts/rust_clippy.sh | 2 +- datafusion/common/Cargo.toml | 2 - datafusion/common/src/lib.rs | 2 - datafusion/common/src/pyarrow.rs | 169 ------------------------------- datafusion/core/Cargo.toml | 1 - 8 files changed, 1 insertion(+), 304 deletions(-) delete mode 100644 datafusion/common/src/pyarrow.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4b3c31e6b3b0..cd2f89874e1a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -209,8 +209,6 @@ jobs: run: cargo check --profile ci --no-default-features -p datafusion --features=math_expressions - name: Check datafusion (parquet) run: cargo check --profile ci --no-default-features -p datafusion --features=parquet - - name: Check datafusion (pyarrow) - run: cargo check --profile ci --no-default-features -p datafusion --features=pyarrow - name: Check datafusion (regex_expressions) run: cargo check --profile ci --no-default-features -p datafusion --features=regex_expressions - name: Check datafusion (recursive_protection) @@ -572,30 +570,6 @@ jobs: shell: bash run: cargo test --profile ci --exclude datafusion-cli --workspace --lib --tests --bins --features avro,json,backtrace,integration-tests - test-datafusion-pyarrow: - name: cargo test pyarrow (amd64) - needs: linux-build-lib - runs-on: ubuntu-latest - container: - image: amd64/rust:bullseye # Use the bullseye tag image which comes with python3.9 - steps: - - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - with: - submodules: true - fetch-depth: 1 - - name: Install PyArrow - run: | - echo "LIBRARY_PATH=$LD_LIBRARY_PATH" >> $GITHUB_ENV - apt-get update - apt-get install python3-pip -y - python3 -m pip install pyarrow - - name: Setup Rust toolchain - uses: ./.github/actions/setup-builder - with: - rust-version: stable - - name: Run datafusion-common tests - run: cargo test --profile ci -p datafusion-common --features=pyarrow,sql - vendor: name: Verify Vendored Code runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index f500265108ff..f85795c9ad0d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -238,7 +238,6 @@ dependencies = [ "arrow-ipc", "arrow-json", "arrow-ord", - "arrow-pyarrow", "arrow-row", "arrow-schema", "arrow-select", @@ -422,18 +421,6 @@ dependencies = [ "arrow-select", ] -[[package]] -name = "arrow-pyarrow" -version = "57.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcfb2be2e9096236f449c11f425cddde18c4cc540f516d90f066f10a29ed515" -dependencies = [ - "arrow-array", - "arrow-data", - "arrow-schema", - "pyo3", -] - [[package]] name = "arrow-row" version = "57.0.0" @@ -2024,7 +2011,6 @@ dependencies = [ "object_store", "parquet", "paste", - "pyo3", "rand 0.9.2", "recursive", "sqlparser", @@ -3765,12 +3751,6 @@ dependencies = [ "web-time", ] -[[package]] -name = "indoc" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" - [[package]] name = "insta" version = "1.43.2" @@ -4113,15 +4093,6 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - [[package]] name = "mimalloc" version = "0.1.48" @@ -4911,67 +4882,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "pyo3" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" -dependencies = [ - "indoc", - "libc", - "memoffset", - "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" -dependencies = [ - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 2.0.108", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.26.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn 2.0.108", -] - [[package]] name = "quad-rand" version = "0.2.3" @@ -6203,12 +6113,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" -[[package]] -name = "target-lexicon" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" - [[package]] name = "tempfile" version = "3.23.0" @@ -6806,12 +6710,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" -[[package]] -name = "unindent" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - [[package]] name = "unit-prefix" version = "0.5.1" diff --git a/README.md b/README.md index 5191496eaafe..5c55b2b15efa 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,6 @@ Optional features: - `avro`: support for reading the [Apache Avro] format - `backtrace`: include backtrace information in error messages - `parquet_encryption`: support for using [Parquet Modular Encryption] -- `pyarrow`: conversions between PyArrow and DataFusion types - `serde`: enable arrow-schema's `serde` feature [apache avro]: https://avro.apache.org/ diff --git a/ci/scripts/rust_clippy.sh b/ci/scripts/rust_clippy.sh index 6a00ad810956..aa994bc2b8c8 100755 --- a/ci/scripts/rust_clippy.sh +++ b/ci/scripts/rust_clippy.sh @@ -18,4 +18,4 @@ # under the License. set -ex -cargo clippy --all-targets --workspace --features avro,pyarrow,integration-tests,extended_tests -- -D warnings \ No newline at end of file +cargo clippy --all-targets --workspace --features avro,integration-tests,extended_tests -- -D warnings diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index a9eb0f2220c6..c5f8972ff0c1 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -45,7 +45,6 @@ parquet_encryption = [ "parquet/encryption", "dep:hex", ] -pyarrow = ["pyo3", "arrow/pyarrow", "parquet"] force_hash_collisions = [] recursive_protection = ["dep:recursive"] parquet = ["dep:parquet"] @@ -71,7 +70,6 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.26", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index c8d5a30ee3e0..c8172820e8ba 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -34,8 +34,6 @@ mod dfschema; mod functional_dependencies; mod join_type; mod param_value; -#[cfg(feature = "pyarrow")] -mod pyarrow; mod schema_reference; mod table_reference; mod unnest; diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs deleted file mode 100644 index 18c6739735ff..000000000000 --- a/datafusion/common/src/pyarrow.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Conversions between PyArrow and DataFusion types - -use arrow::array::{Array, ArrayData}; -use arrow::pyarrow::{FromPyArrow, ToPyArrow}; -use pyo3::exceptions::PyException; -use pyo3::prelude::PyErr; -use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyResult, Python}; - -use crate::{DataFusionError, ScalarValue}; - -impl From for PyErr { - fn from(err: DataFusionError) -> PyErr { - PyException::new_err(err.to_string()) - } -} - -impl FromPyArrow for ScalarValue { - fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { - let py = value.py(); - let typ = value.getattr("type")?; - let val = value.call_method0("as_py")?; - - // construct pyarrow array from the python value and pyarrow type - let factory = py.import("pyarrow")?.getattr("array")?; - let args = PyList::new(py, [val])?; - let array = factory.call1((args, typ))?; - - // convert the pyarrow array to rust array using C data interface - let array = arrow::array::make_array(ArrayData::from_pyarrow_bound(&array)?); - let scalar = ScalarValue::try_from_array(&array, 0)?; - - Ok(scalar) - } -} - -impl ToPyArrow for ScalarValue { - fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { - let array = self.to_array()?; - // convert to pyarrow array using C data interface - let pyarray = array.to_data().to_pyarrow(py)?; - let pyscalar = pyarray.call_method1("__getitem__", (0,))?; - - Ok(pyscalar) - } -} - -impl<'source> FromPyObject<'source> for ScalarValue { - fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult { - Self::from_pyarrow_bound(value) - } -} - -impl<'source> IntoPyObject<'source> for ScalarValue { - type Target = PyAny; - - type Output = Bound<'source, Self::Target>; - - type Error = PyErr; - - fn into_pyobject(self, py: Python<'source>) -> Result { - let array = self.to_array()?; - // convert to pyarrow array using C data interface - let pyarray = array.to_data().to_pyarrow(py)?; - pyarray.call_method1("__getitem__", (0,)) - } -} - -#[cfg(test)] -mod tests { - use pyo3::ffi::c_str; - use pyo3::py_run; - use pyo3::types::PyDict; - use pyo3::Python; - - use super::*; - - fn init_python() { - Python::initialize(); - Python::attach(|py| { - if py.run(c_str!("import pyarrow"), None, None).is_err() { - let locals = PyDict::new(py); - py.run( - c_str!( - "import sys; executable = sys.executable; python_path = sys.path" - ), - None, - Some(&locals), - ) - .expect("Couldn't get python info"); - let executable = locals.get_item("executable").unwrap(); - let executable: String = executable.extract().unwrap(); - - let python_path = locals.get_item("python_path").unwrap(); - let python_path: Vec = python_path.extract().unwrap(); - - panic!("pyarrow not found\nExecutable: {executable}\nPython path: {python_path:?}\n\ - HINT: try `pip install pyarrow`\n\ - NOTE: On Mac OS, you must compile against a Framework Python \ - (default in python.org installers and brew, but not pyenv)\n\ - NOTE: On Mac OS, PYO3 might point to incorrect Python library \ - path when using virtual environments. Try \ - `export PYTHONPATH=$(python -c \"import sys; print(sys.path[-1])\")`\n") - } - }) - } - - #[test] - fn test_roundtrip() { - init_python(); - - let example_scalars = [ - ScalarValue::Boolean(Some(true)), - ScalarValue::Int32(Some(23)), - ScalarValue::Float64(Some(12.34)), - ScalarValue::from("Hello!"), - ScalarValue::Date32(Some(1234)), - ]; - - Python::attach(|py| { - for scalar in example_scalars.iter() { - let result = - ScalarValue::from_pyarrow_bound(&scalar.to_pyarrow(py).unwrap()) - .unwrap(); - assert_eq!(scalar, &result); - } - }); - } - - #[test] - fn test_py_scalar() -> PyResult<()> { - init_python(); - - Python::attach(|py| -> PyResult<()> { - let scalar_float = ScalarValue::Float64(Some(12.34)); - let py_float = scalar_float - .into_pyobject(py)? - .call_method0("as_py") - .unwrap(); - py_run!(py, py_float, "assert py_float == 12.34"); - - let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string())); - let py_string = scalar_string - .into_pyobject(py)? - .call_method0("as_py") - .unwrap(); - py_run!(py, py_string, "assert py_string == 'Hello!'"); - - Ok(()) - }) - } -} diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index f672e3a94681..8de020aa2fb1 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -76,7 +76,6 @@ parquet_encryption = [ "datafusion-common/parquet_encryption", "datafusion-datasource-parquet/parquet_encryption", ] -pyarrow = ["datafusion-common/pyarrow", "parquet"] regex_expressions = [ "datafusion-functions/regex_expressions", ] From 3832bd6cd55a138e14e2a5845b25cef240a00d42 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 8 Nov 2025 09:50:00 -0500 Subject: [PATCH 2/8] Add pyarrow feature removal to upgrade notes --- docs/source/library-user-guide/upgrading.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 0b227000f73d..4642cc4189cd 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -19,6 +19,15 @@ # Upgrade Guides +## DataFusion `52.0.0` + +TODO(tsaucer) Add link to issue for 52 release. + +### Removal of `pyarrow` feature + +The `pyarrow` feature flag has been removed. This feature has been migrated to +the `datafusion-python` repository since version `44.0.0`. + ## DataFusion `51.0.0` **Note:** DataFusion `51.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. From 4bc39f75c23c561ef6a90774bd26297a5825ae2f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Nov 2025 12:12:20 -0500 Subject: [PATCH 3/8] Update upgrade guide --- docs/source/library-user-guide/upgrading.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 4642cc4189cd..91f46d9931a3 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -21,7 +21,9 @@ ## DataFusion `52.0.0` -TODO(tsaucer) Add link to issue for 52 release. +**Note:** DataFusion `52.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. + +You can see the current [status of the `52.0.0`release here](https://github.com/apache/datafusion/issues/18566) ### Removal of `pyarrow` feature @@ -30,10 +32,6 @@ the `datafusion-python` repository since version `44.0.0`. ## DataFusion `51.0.0` -**Note:** DataFusion `51.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. - -You can see the current [status of the `51.0.0`release here](https://github.com/apache/datafusion/issues/17558) - ### `arrow` / `parquet` updated to 57.0.0 ### Upgrade to arrow `57.0.0` and parquet `57.0.0` From 06efa050941f29e044c42a121e8ded9540a2ab62 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Nov 2025 12:14:08 -0500 Subject: [PATCH 4/8] Add back in 51 release text. That can be a different PR --- docs/source/library-user-guide/upgrading.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 91f46d9931a3..5bbbb3031d36 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -32,6 +32,11 @@ the `datafusion-python` repository since version `44.0.0`. ## DataFusion `51.0.0` +**Note:** DataFusion `51.0.0` has not been released yet. The information provid +ed in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. + +You can see the current [status of the `51.0.0`release here](https://github.com/apache/datafusion/issues/17558) + ### `arrow` / `parquet` updated to 57.0.0 ### Upgrade to arrow `57.0.0` and parquet `57.0.0` From d57758ce0703470f9a0f7be0909f0b104492a554 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Nov 2025 12:15:42 -0500 Subject: [PATCH 5/8] typo --- docs/source/library-user-guide/upgrading.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 5bbbb3031d36..d8735d655282 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -32,8 +32,7 @@ the `datafusion-python` repository since version `44.0.0`. ## DataFusion `51.0.0` -**Note:** DataFusion `51.0.0` has not been released yet. The information provid -ed in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. +**Note:** DataFusion `51.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. You can see the current [status of the `51.0.0`release here](https://github.com/apache/datafusion/issues/17558) From 4e26148d3bb7517a833db5f1353dea700c93bd2e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Nov 2025 12:30:04 -0500 Subject: [PATCH 6/8] Remove unused dep introduced in #18146 --- Cargo.lock | 1 - datafusion/catalog-listing/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f85795c9ad0d..6c43db0424ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1957,7 +1957,6 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "tokio", ] [[package]] diff --git a/datafusion/catalog-listing/Cargo.toml b/datafusion/catalog-listing/Cargo.toml index 4eaeed675a20..bf5c0dc9a82f 100644 --- a/datafusion/catalog-listing/Cargo.toml +++ b/datafusion/catalog-listing/Cargo.toml @@ -46,7 +46,6 @@ futures = { workspace = true } itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } -tokio = { workspace = true } [dev-dependencies] datafusion-datasource-parquet = { workspace = true } From 141f8e11441562501b955c4c98a266c1394601b4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 23 Nov 2025 09:34:17 -0500 Subject: [PATCH 7/8] fix --- docs/source/library-user-guide/upgrading.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 795a40ede374..91ba5acb1957 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -157,8 +157,6 @@ Instead of silently succeeding. The remove API no longer requires a mutable instance -> > > > > > > apache/main - ## DataFusion `51.0.0` ### `arrow` / `parquet` updated to 57.0.0 From 86aa53018f543a286ab06580ca889e141da29ded Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 23 Nov 2025 09:43:35 -0500 Subject: [PATCH 8/8] Remove datafusion-ffi due to cargo-machete telling me to --- Cargo.lock | 1 - datafusion-examples/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f79f067fbc27..6f0f58b7617a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2187,7 +2187,6 @@ dependencies = [ "bytes", "dashmap", "datafusion", - "datafusion-ffi", "datafusion-physical-expr-adapter", "datafusion-proto", "env_logger", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 61711f8472eb..000c4b34c07f 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -54,7 +54,6 @@ dashmap = { workspace = true } # note only use main datafusion crate for examples base64 = "0.22.1" datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] } -datafusion-ffi = { workspace = true } datafusion-physical-expr-adapter = { workspace = true } datafusion-proto = { workspace = true } env_logger = { workspace = true }