Commit 5f10176

Implement UDAF improvements for list type handling

Store the UDAF return type in the Rust accumulator and wrap pyarrow Array/ChunkedArray returns in list scalars when the declared return type is list-like. Add a UDAF test that returns a list of timestamps via a pyarrow array, validating the aggregate output.

1 parent 7aff363 commit 5f10176

File tree: 2 files changed, +78 -9 lines changed
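
To make the commit message concrete: when an accumulator's evaluate() returns a pyarrow Array (or ChunkedArray) but the UDAF's declared return type is list-like, the value is now converted into a single list scalar. A minimal sketch of that conversion, with illustrative int64 values rather than the timestamps used in the test below:

import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.int64())  # what evaluate() hands back
list_type = pa.list_(pa.int64())            # the UDAF's declared return type
wrapped = pa.scalar(arr.to_pylist(), type=list_type)
assert wrapped.as_py() == [1, 2, 3]         # one list value per aggregation group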

python/tests/test_udaf.py (45 additions, 0 deletions)

@@ -17,6 +17,8 @@
 
 from __future__ import annotations
 
+from datetime import datetime
+
 import pyarrow as pa
 import pyarrow.compute as pc
 import pytest
@@ -58,6 +60,25 @@ def state(self) -> list[pa.Scalar]:
         return [self._sum]
 
 
+class CollectTimestamps(Accumulator):
+    def __init__(self):
+        self._values: list[datetime] = []
+
+    def state(self) -> list[pa.Scalar]:
+        return [pa.scalar(self._values, type=pa.list_(pa.timestamp("ns")))]
+
+    def update(self, values: pa.Array) -> None:
+        self._values.extend(values.to_pylist())
+
+    def merge(self, states: list[pa.Array]) -> None:
+        for state in states[0].to_pylist():
+            if state is not None:
+                self._values.extend(state)
+
+    def evaluate(self) -> pa.Array:
+        return pa.array(self._values, type=pa.timestamp("ns"))
+
+
 @pytest.fixture
 def df(ctx):
     # create a RecordBatch and a new DataFrame from it
@@ -217,3 +238,27 @@ def test_register_udaf(ctx, df) -> None:
     df_result = ctx.sql("select summarize(b) from test_table")
 
     assert df_result.collect()[0][0][0].as_py() == 14.0
+
+
+def test_udaf_list_timestamp_return(ctx) -> None:
+    timestamps = [datetime(2024, 1, 1), datetime(2024, 1, 2)]
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array(timestamps, type=pa.timestamp("ns"))],
+        names=["ts"],
+    )
+    df = ctx.create_dataframe([[batch]], name="timestamp_table")
+
+    collect = udaf(
+        CollectTimestamps,
+        pa.timestamp("ns"),
+        pa.list_(pa.timestamp("ns")),
+        [pa.list_(pa.timestamp("ns"))],
+        volatility="immutable",
+    )
+
+    result = df.aggregate([], [collect(column("ts"))]).collect()[0]
+
+    assert result.column(0) == pa.array(
+        [timestamps],
+        type=pa.list_(pa.timestamp("ns")),
+    )

src/udaf.rs (33 additions, 9 deletions)

@@ -27,7 +27,7 @@ use datafusion::logical_expr::{
 };
 use datafusion_ffi::udaf::{FFI_AggregateUDF, ForeignAggregateUDF};
 use pyo3::prelude::*;
-use pyo3::types::{PyCapsule, PyTuple};
+use pyo3::types::{PyCapsule, PyDict, PyTuple, PyType};
 
 use crate::common::data_type::PyScalarValue;
 use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult};
@@ -37,11 +37,12 @@ use crate::utils::{parse_volatility, validate_pycapsule};
 #[derive(Debug)]
 struct RustAccumulator {
     accum: Py<PyAny>,
+    return_type: DataType,
 }
 
 impl RustAccumulator {
-    fn new(accum: Py<PyAny>) -> Self {
-        Self { accum }
+    fn new(accum: Py<PyAny>, return_type: DataType) -> Self {
+        Self { accum, return_type }
     }
 }
 
@@ -59,10 +60,23 @@ impl Accumulator for RustAccumulator {
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
         Python::attach(|py| {
-            self.accum
-                .bind(py)
-                .call_method0("evaluate")?
-                .extract::<PyScalarValue>()
+            let value = self.accum.bind(py).call_method0("evaluate")?;
+            let is_list_type = matches!(
+                self.return_type,
+                DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _)
+            );
+            if is_list_type && is_pyarrow_array_like(py, &value)? {
+                let pyarrow = PyModule::import(py, "pyarrow")?;
+                let list_value = value.call_method0("to_pylist")?;
+                let py_type = self.return_type.to_pyarrow(py)?;
+                let kwargs = PyDict::new(py);
+                kwargs.set_item("type", py_type)?;
+                return pyarrow
+                    .getattr("scalar")?
+                    .call((list_value,), Some(&kwargs))?
+                    .extract::<PyScalarValue>();
+            }
+            value.extract::<PyScalarValue>()
         })
         .map(|v| v.0)
         .map_err(|e| DataFusionError::Execution(format!("{e}")))
@@ -144,16 +158,26 @@ impl Accumulator for RustAccumulator {
 }
 
 pub fn to_rust_accumulator(accum: Py<PyAny>) -> AccumulatorFactoryFunction {
-    Arc::new(move |_| -> Result<Box<dyn Accumulator>> {
+    Arc::new(move |args| -> Result<Box<dyn Accumulator>> {
         let accum = Python::attach(|py| {
             accum
                 .call0(py)
                 .map_err(|e| DataFusionError::Execution(format!("{e}")))
         })?;
-        Ok(Box::new(RustAccumulator::new(accum)))
+        Ok(Box::new(RustAccumulator::new(
+            accum,
+            args.return_type.clone(),
+        )))
     })
 }
 
+fn is_pyarrow_array_like(py: Python<'_>, value: &Bound<'_, PyAny>) -> PyResult<bool> {
+    let pyarrow = PyModule::import(py, "pyarrow")?;
+    let array_type = pyarrow.getattr("Array")?.downcast_into::<PyType>()?;
+    let chunked_array_type = pyarrow.getattr("ChunkedArray")?.downcast_into::<PyType>()?;
+    Ok(value.is_instance(&array_type)? || value.is_instance(&chunked_array_type)?)
+}
+
 fn aggregate_udf_from_capsule(capsule: &Bound<'_, PyCapsule>) -> PyDataFusionResult<AggregateUDF> {
     validate_pycapsule(capsule, "datafusion_aggregate_udf")?;
 
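For readers who don't follow Rust, the new evaluate path behaves roughly like the Python below (a sketch only; the helper name wrap_evaluate_result is hypothetical and not part of the commit):

import pyarrow as pa

def wrap_evaluate_result(value, declared_return_type):
    # Hypothetical mirror of the Rust logic: only Array/ChunkedArray results
    # combined with a list-like declared return type get wrapped.
    is_list_type = (
        pa.types.is_list(declared_return_type)
        or pa.types.is_large_list(declared_return_type)
        or pa.types.is_fixed_size_list(declared_return_type)
    )
    if is_list_type and isinstance(value, (pa.Array, pa.ChunkedArray)):
        # to_pylist() flattens ChunkedArray chunks, so both cases collapse
        # into one list scalar of the declared type.
        return pa.scalar(value.to_pylist(), type=declared_return_type)
    return value

Since ChunkedArray.to_pylist() flattens across chunks, a chunked evaluate() result takes the same path as a plain Array.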