@@ -123,7 +123,7 @@ also see how the inputs to ``update`` and ``merge`` differ.

 .. code-block:: python

-    import pyarrow
+    import pyarrow as pa
     import pyarrow.compute
     import datafusion
     from datafusion import col, udaf, Accumulator
@@ -136,16 +136,16 @@ also see how the inputs to ``update`` and ``merge`` differ.
         def __init__(self):
             self._sum = 0.0

-        def update(self, values_a: pyarrow.Array, values_b: pyarrow.Array) -> None:
+        def update(self, values_a: pa.Array, values_b: pa.Array) -> None:
             self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py()

-        def merge(self, states: List[pyarrow.Array]) -> None:
+        def merge(self, states: list[pa.Array]) -> None:
             self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py()

-        def state(self) -> pyarrow.Array:
-            return pyarrow.array([self._sum])
+        def state(self) -> list[pa.Scalar]:
+            return [pyarrow.scalar(self._sum)]

-        def evaluate(self) -> pyarrow.Scalar:
+        def evaluate(self) -> pa.Scalar:
             return pyarrow.scalar(self._sum)

     ctx = datafusion.SessionContext()
@@ -156,20 +156,28 @@ also see how the inputs to ``update`` and ``merge`` differ.
         }
     )

-    my_udaf = udaf(MyAccumulator, [pyarrow.float64(), pyarrow.float64()], pyarrow.float64(), [pyarrow.float64()], 'stable')
+    my_udaf = udaf(MyAccumulator, [pa.float64(), pa.float64()], pa.float64(), [pa.float64()], 'stable')

     df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")])

 FAQ
 ^^^

 **How do I return a list from a UDAF?**
-Use a list-valued scalar and declare list types for both the return and state
-definitions. Returning a ``pyarrow.Array`` from ``evaluate`` is not supported
-unless you convert it to a list scalar. For example, in ``evaluate`` you can
-return ``pa.scalar([...], type=pa.list_(pa.timestamp("ms")))`` and register the
-UDAF with ``return_type=pa.list_(pa.timestamp("ms"))`` and
-``state_type=[pa.list_(pa.timestamp("ms"))]``.
+
+Both the ``evaluate`` and ``state`` functions are expected to return scalar
+values. If you wish to return a list of values as a single scalar, the best
+practice is to wrap them in a ``pyarrow.Scalar`` object. For example, you can
+return a timestamp list with ``pa.scalar([...], type=pa.list_(pa.timestamp("ms")))``
+and register the corresponding return and state types as
+``return_type=pa.list_(pa.timestamp("ms"))`` and
+``state_type=[pa.list_(pa.timestamp("ms"))]``, respectively.
+
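+A minimal sketch of this pattern is shown below. It assumes a hypothetical
+``TimestampCollector`` accumulator that gathers timestamps into a list; the
+aggregation logic is illustrative only.
+
+.. code-block:: python
+
+    import pyarrow as pa
+    from datafusion import udaf, Accumulator
+
+    class TimestampCollector(Accumulator):
+        """Collect all input timestamps into a single list value."""
+
+        def __init__(self):
+            self._values = []
+
+        def update(self, values: pa.Array) -> None:
+            # Append this batch of timestamps to the running list
+            self._values.extend(values.to_pylist())
+
+        def merge(self, states: list[pa.Array]) -> None:
+            # Each state row is itself a list of timestamps
+            for row in states[0].to_pylist():
+                if row is not None:
+                    self._values.extend(row)
+
+        def state(self) -> list[pa.Scalar]:
+            # Wrap the Python list in a single list-valued scalar
+            return [pa.scalar(self._values, type=pa.list_(pa.timestamp("ms")))]
+
+        def evaluate(self) -> pa.Scalar:
+            return pa.scalar(self._values, type=pa.list_(pa.timestamp("ms")))
+
+    collect_ts = udaf(
+        TimestampCollector,
+        [pa.timestamp("ms")],            # input type
+        pa.list_(pa.timestamp("ms")),    # return type
+        [pa.list_(pa.timestamp("ms"))],  # state type
+        'stable',
+    )
+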
+As of DataFusion 52.0.0, you can return any Python object, including a PyArrow
+array, from these functions, and DataFusion will attempt to convert the value
+into a scalar. This conversion has been tested with PyArrow, nanoarrow, and
+arro3 objects as well as primitive data types such as integers and strings.
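+
+As a sketch of what this relaxation allows (assuming DataFusion 52.0.0 or
+later, per the note above), the ``state`` and ``evaluate`` methods of the
+earlier example could skip the explicit scalar wrapping:
+
+.. code-block:: python
+
+    def state(self) -> list:
+        # A plain Python float; DataFusion attempts the scalar conversion
+        return [self._sum]
+
+    def evaluate(self) -> float:
+        # Primitive values and PyArrow objects are both accepted
+        return self._sum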

 Window Functions
 ----------------