From b9d5f6539fb4449b2a758de265db3af2de5c9510 Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Mon, 30 Mar 2026 16:06:41 -0400
Subject: [PATCH 1/9] Add missing array/list functions and aliases (#1452)

Add new array functions from upstream DataFusion v53: array_any_value,
array_distance, array_max, array_min, array_reverse, arrays_zip,
string_to_array, and gen_series. Add corresponding list_* aliases and
missing list_* aliases for existing functions (list_empty, list_pop_back,
list_pop_front, list_has, list_has_all, list_has_any). Also add
array_contains/list_contains as aliases for array_has, generate_series
as alias for gen_series, and string_to_list as alias for string_to_array.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/core/src/functions.rs   |  30 ++++
 python/datafusion/functions.py | 294 ++++++++++++++++++++++++++++++++-
 2 files changed, 323 insertions(+), 1 deletion(-)
diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs
index fefe14b3e..94759ac4a 100644
--- a/crates/core/src/functions.rs
+++ b/crates/core/src/functions.rs
@@ -93,6 +93,22 @@ fn array_cat(exprs: Vec<PyExpr>) -> PyExpr {
     array_concat(exprs)
 }
 
+#[pyfunction]
+fn array_distance(array1: PyExpr, array2: PyExpr) -> PyExpr {
+    let args = vec![array1.into(), array2.into()];
+    Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf(
+        datafusion::functions_nested::distance::array_distance_udf(),
+        args,
+    ))
+    .into()
+}
+
+#[pyfunction]
+fn arrays_zip(exprs: Vec<PyExpr>) -> PyExpr {
+    let exprs = exprs.into_iter().map(|x| x.into()).collect();
+    datafusion::functions_nested::expr_fn::arrays_zip(exprs).into()
+}
+
 #[pyfunction]
 #[pyo3(signature = (array, element, index=None))]
 fn array_position(array: PyExpr, element: PyExpr, index: Option<i64>) -> PyExpr {
@@ -667,6 +683,12 @@ array_fn!(array_intersect, first_array second_array);
 array_fn!(array_union, array1 array2);
 array_fn!(array_except, first_array second_array);
 array_fn!(array_resize, array size value);
+array_fn!(array_any_value, array);
+array_fn!(array_max, array);
+array_fn!(array_min, array);
+array_fn!(array_reverse, array);
+array_fn!(string_to_array, string delimiter null_string);
+array_fn!(gen_series, start stop step);
 array_fn!(cardinality, array);
 array_fn!(flatten, array);
 array_fn!(range, start stop step);
@@ -1129,6 +1151,14 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(array_replace_all))?;
     m.add_wrapped(wrap_pyfunction!(array_sort))?;
     m.add_wrapped(wrap_pyfunction!(array_slice))?;
+    m.add_wrapped(wrap_pyfunction!(array_any_value))?;
+    m.add_wrapped(wrap_pyfunction!(array_distance))?;
+    m.add_wrapped(wrap_pyfunction!(array_max))?;
+    m.add_wrapped(wrap_pyfunction!(array_min))?;
+    m.add_wrapped(wrap_pyfunction!(array_reverse))?;
+    m.add_wrapped(wrap_pyfunction!(arrays_zip))?;
+    m.add_wrapped(wrap_pyfunction!(string_to_array))?;
+    m.add_wrapped(wrap_pyfunction!(gen_series))?;
     m.add_wrapped(wrap_pyfunction!(flatten))?;
     m.add_wrapped(wrap_pyfunction!(cardinality))?;
 
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 2ef2f0473..e1954d4d9 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -53,10 +53,13 @@
     "approx_percentile_cont_with_weight",
     "array",
     "array_agg",
+    "array_any_value",
     "array_append",
     "array_cat",
     "array_concat",
+    "array_contains",
     "array_dims",
+    "array_distance",
     "array_distinct",
     "array_element",
     "array_empty",
@@ -69,6 +72,8 @@
     "array_intersect",
     "array_join",
     "array_length",
+    "array_max",
+    "array_min",
     "array_ndims",
     "array_pop_back",
     "array_pop_front",
@@ -85,10 +90,12 @@
     "array_replace_all",
     "array_replace_n",
     "array_resize",
+    "array_reverse",
     "array_slice",
     "array_sort",
     "array_to_string",
     "array_union",
+    "arrays_zip",
     "arrow_cast",
     "arrow_typeof",
     "ascii",
@@ -152,6 +159,8 @@
     "floor",
     "from_unixtime",
     "gcd",
+    "gen_series",
+    "generate_series",
     "in_list",
     "initcap",
     "isnan",
@@ -163,19 +172,30 @@
     "left",
     "length",
     "levenshtein",
+    "list_any_value",
     "list_append",
     "list_cat",
     "list_concat",
+    "list_contains",
     "list_dims",
+    "list_distance",
     "list_distinct",
     "list_element",
+    "list_empty",
     "list_except",
     "list_extract",
+    "list_has",
+    "list_has_all",
+    "list_has_any",
     "list_indexof",
     "list_intersect",
     "list_join",
     "list_length",
+    "list_max",
+    "list_min",
     "list_ndims",
+    "list_pop_back",
+    "list_pop_front",
     "list_position",
     "list_positions",
     "list_prepend",
@@ -189,10 +209,12 @@
     "list_replace_all",
     "list_replace_n",
     "list_resize",
+    "list_reverse",
     "list_slice",
     "list_sort",
     "list_to_string",
     "list_union",
+    "list_zip",
     "ln",
     "log",
     "log2",
@@ -263,6 +285,8 @@
     "stddev_pop",
     "stddev_samp",
     "string_agg",
+    "string_to_array",
+    "string_to_list",
     "strpos",
     "struct",
     "substr",
@@ -293,7 +317,6 @@
     "var_samp",
     "var_sample",
     "when",
-    # Window Functions
     "window",
 ]
 
@@ -2719,6 +2742,15 @@ def array_empty(array: Expr) -> Expr:
     return Expr(f.array_empty(array.expr))
 
 
+def list_empty(array: Expr) -> Expr:
+    """Returns a boolean indicating whether the array is empty.
+
+    See Also:
+        This is an alias for :py:func:`array_empty`.
+    """
+    return array_empty(array)
+
+
 def array_extract(array: Expr, n: Expr) -> Expr:
     """Extracts the element with the index n from the array.
 
@@ -2816,6 +2848,51 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr:
     return Expr(f.array_has_any(first_array.expr, second_array.expr))
 
 
+def array_contains(first_array: Expr, second_array: Expr) -> Expr:
+    """Returns true if the element appears in the first array, otherwise false.
+
+    See Also:
+        This is an alias for :py:func:`array_has`.
+    """
+    return array_has(first_array, second_array)
+
+
+def list_has(first_array: Expr, second_array: Expr) -> Expr:
+    """Returns true if the element appears in the first array, otherwise false.
+
+    See Also:
+        This is an alias for :py:func:`array_has`.
+    """
+    return array_has(first_array, second_array)
+
+
+def list_has_all(first_array: Expr, second_array: Expr) -> Expr:
+    """Determines if there is complete overlap ``second_array`` in ``first_array``.
+
+    See Also:
+        This is an alias for :py:func:`array_has_all`.
+    """
+    return array_has_all(first_array, second_array)
+
+
+def list_has_any(first_array: Expr, second_array: Expr) -> Expr:
+    """Determine if there is an overlap between ``first_array`` and ``second_array``.
+
+    See Also:
+        This is an alias for :py:func:`array_has_any`.
+    """
+    return array_has_any(first_array, second_array)
+
+
+def list_contains(first_array: Expr, second_array: Expr) -> Expr:
+    """Returns true if the element appears in the first array, otherwise false.
+
+    See Also:
+        This is an alias for :py:func:`array_has`.
+    """
+    return array_has(first_array, second_array)
+
+
 def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr:
     """Return the position of the first occurrence of ``element`` in ``array``.
 
@@ -2983,6 +3060,24 @@ def array_pop_front(array: Expr) -> Expr:
     return Expr(f.array_pop_front(array.expr))
 
 
+def list_pop_back(array: Expr) -> Expr:
+    """Returns the array without the last element.
+
+    See Also:
+        This is an alias for :py:func:`array_pop_back`.
+    """
+    return array_pop_back(array)
+
+
+def list_pop_front(array: Expr) -> Expr:
+    """Returns the array without the first element.
+
+    See Also:
+        This is an alias for :py:func:`array_pop_front`.
+    """
+    return array_pop_front(array)
+
+
 def array_remove(array: Expr, element: Expr) -> Expr:
     """Removes the first element from the array equal to the given value.
 
@@ -3354,6 +3449,203 @@ def list_resize(array: Expr, size: Expr, value: Expr) -> Expr:
     return array_resize(array, size, value)
 
 
+def array_any_value(array: Expr) -> Expr:
+    """Returns the first non-null element in the array.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [[None, 2, 3]]})
+        >>> result = df.select(
+        ...     dfn.functions.array_any_value(dfn.col("a")).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        2
+    """
+    return Expr(f.array_any_value(array.expr))
+
+
+def list_any_value(array: Expr) -> Expr:
+    """Returns the first non-null element in the array.
+
+    See Also:
+        This is an alias for :py:func:`array_any_value`.
+    """
+    return array_any_value(array)
+
+
+def array_distance(array1: Expr, array2: Expr) -> Expr:
+    """Returns the Euclidean distance between two numeric arrays.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 4.0]]})
+        >>> result = df.select(
+        ...     dfn.functions.array_distance(
+        ...         dfn.col("a"), dfn.col("b"),
+        ...     ).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        2.0
+    """
+    return Expr(f.array_distance(array1.expr, array2.expr))
+
+
+def list_distance(array1: Expr, array2: Expr) -> Expr:
+    """Returns the Euclidean distance between two numeric arrays.
+
+    See Also:
+        This is an alias for :py:func:`array_distance`.
+    """
+    return array_distance(array1, array2)
+
+
+def array_max(array: Expr) -> Expr:
+    """Returns the maximum value in the array.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+        >>> result = df.select(
+        ...     dfn.functions.array_max(dfn.col("a")).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        3
+    """
+    return Expr(f.array_max(array.expr))
+
+
+def list_max(array: Expr) -> Expr:
+    """Returns the maximum value in the array.
+
+    See Also:
+        This is an alias for :py:func:`array_max`.
+    """
+    return array_max(array)
+
+
+def array_min(array: Expr) -> Expr:
+    """Returns the minimum value in the array.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+        >>> result = df.select(
+        ...     dfn.functions.array_min(dfn.col("a")).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        1
+    """
+    return Expr(f.array_min(array.expr))
+
+
+def list_min(array: Expr) -> Expr:
+    """Returns the minimum value in the array.
+
+    See Also:
+        This is an alias for :py:func:`array_min`.
+    """
+    return array_min(array)
+
+
+def array_reverse(array: Expr) -> Expr:
+    """Reverses the order of elements in the array.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [[1, 2, 3]]})
+        >>> result = df.select(
+        ...     dfn.functions.array_reverse(dfn.col("a")).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        [3, 2, 1]
+    """
+    return Expr(f.array_reverse(array.expr))
+
+
+def list_reverse(array: Expr) -> Expr:
+    """Reverses the order of elements in the array.
+
+    See Also:
+        This is an alias for :py:func:`array_reverse`.
+    """
+    return array_reverse(array)
+
+
+def arrays_zip(*arrays: Expr) -> Expr:
+    """Combines multiple arrays into a single array of structs.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
+        >>> result = df.select(
+        ...     dfn.functions.arrays_zip(dfn.col("a"), dfn.col("b")).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        [{'c0': 1, 'c1': 3}, {'c0': 2, 'c1': 4}]
+    """
+    args = [a.expr for a in arrays]
+    return Expr(f.arrays_zip(args))
+
+
+def list_zip(*arrays: Expr) -> Expr:
+    """Combines multiple arrays into a single array of structs.
+
+    See Also:
+        This is an alias for :py:func:`arrays_zip`.
+    """
+    return arrays_zip(*arrays)
+
+
+def string_to_array(string: Expr, delimiter: Expr, null_string: Expr) -> Expr:
+    """Splits a string based on a delimiter and returns an array of parts.
+
+    Any parts matching the ``null_string`` will be replaced with ``NULL``.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": ["hello,world"]})
+        >>> result = df.select(
+        ...     dfn.functions.string_to_array(
+        ...         dfn.col("a"), dfn.lit(","), dfn.lit(""),
+        ...     ).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        ['hello', 'world']
+    """
+    return Expr(f.string_to_array(string.expr, delimiter.expr, null_string.expr))
+
+
+def string_to_list(string: Expr, delimiter: Expr, null_string: Expr) -> Expr:
+    """Splits a string based on a delimiter and returns an array of parts.
+
+    See Also:
+        This is an alias for :py:func:`string_to_array`.
+    """
+    return string_to_array(string, delimiter, null_string)
+
+
+def gen_series(start: Expr, stop: Expr, step: Expr) -> Expr:
+    """Creates a list of values in the range between start and stop.
+
+    Unlike :py:func:`range`, this includes the upper bound.
+
+    Examples:
+        >>> ctx = dfn.SessionContext()
+        >>> df = ctx.from_pydict({"a": [0]})
+        >>> result = df.select(
+        ...     dfn.functions.gen_series(
+        ...         dfn.lit(1), dfn.lit(5), dfn.lit(1),
+        ...     ).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        [1, 2, 3, 4, 5]
+    """
+    return Expr(f.gen_series(start.expr, stop.expr, step.expr))
+
+
+def generate_series(start: Expr, stop: Expr, step: Expr) -> Expr:
+    """Creates a list of values in the range between start and stop.
+
+    Unlike :py:func:`range`, this includes the upper bound.
+
+    See Also:
+        This is an alias for :py:func:`gen_series`.
+    """
+    return gen_series(start, stop, step)
+
+
 def flatten(array: Expr) -> Expr:
     """Flattens an array of arrays into a single array.
 

From a7c7de497b7821e9b38449803e13f34bb7902a52 Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Mon, 30 Mar 2026 16:14:13 -0400
Subject: [PATCH 2/9] Add unit tests for new array/list functions and aliases

Tests cover all functions and aliases added in the previous commit:
array_any_value, array_distance, array_max, array_min, array_reverse,
arrays_zip, string_to_array, gen_series, generate_series,
array_contains, list_contains, list_empty, list_pop_back,
list_pop_front, list_has, list_has_all, list_has_any, and list_*
aliases for the new functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/tests/test_functions.py | 189 +++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index db141fbe0..8f3c9b856 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -1469,3 +1469,192 @@ def test_coalesce(df):
     assert result.column(0) == pa.array(
         ["Hello", "fallback", "!"], type=pa.string_view()
     )
+
+
+def test_array_any_value():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[None, 2, 3], [None, None, None], [1, 2, 3]]})
+    result = df.select(f.array_any_value(column("a")).alias("v")).collect()
+    values = [row.as_py() for row in result[0].column(0)]
+    assert values[0] == 2
+    assert values[1] is None
+    assert values[2] == 1
+
+
+def test_list_any_value():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[None, 5]]})
+    result = df.select(f.list_any_value(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == 5
+
+
+def test_array_distance():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 4.0]]})
+    result = df.select(f.array_distance(column("a"), column("b")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == pytest.approx(2.0)
+
+
+def test_list_distance():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[3.0, 0.0]], "b": [[0.0, 4.0]]})
+    result = df.select(f.list_distance(column("a"), column("b")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == pytest.approx(5.0)
+
+
+def test_array_max():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]})
+    result = df.select(f.array_max(column("a")).alias("v")).collect()
+    values = [row.as_py() for row in result[0].column(0)]
+    assert values == [5, 10]
+
+
+def test_list_max():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[7, 2, 9]]})
+    result = df.select(f.list_max(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == 9
+
+
+def test_array_min():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]})
+    result = df.select(f.array_min(column("a")).alias("v")).collect()
+    values = [row.as_py() for row in result[0].column(0)]
+    assert values == [1, 2]
+
+
+def test_list_min():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[7, 2, 9]]})
+    result = df.select(f.list_min(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == 2
+
+
+def test_array_reverse():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]})
+    result = df.select(f.array_reverse(column("a")).alias("v")).collect()
+    values = [row.as_py() for row in result[0].column(0)]
+    assert values == [[3, 2, 1], [5, 4]]
+
+
+def test_list_reverse():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[10, 20, 30]]})
+    result = df.select(f.list_reverse(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == [30, 20, 10]
+
+
+def test_arrays_zip():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
+    result = df.select(f.arrays_zip(column("a"), column("b")).alias("v")).collect()
+    values = result[0].column(0)[0].as_py()
+    assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}]
+
+
+def test_list_zip():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
+    result = df.select(f.list_zip(column("a"), column("b")).alias("v")).collect()
+    values = result[0].column(0)[0].as_py()
+    assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}]
+
+
+def test_string_to_array():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": ["hello,world,foo"]})
+    result = df.select(
+        f.string_to_array(column("a"), literal(","), literal("")).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() == ["hello", "world", "foo"]
+
+
+def test_string_to_list():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": ["a-b-c"]})
+    result = df.select(
+        f.string_to_list(column("a"), literal("-"), literal("")).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() == ["a", "b", "c"]
+
+
+def test_gen_series():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [0]})
+    result = df.select(
+        f.gen_series(literal(1), literal(5), literal(1)).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() == [1, 2, 3, 4, 5]
+
+
+def test_generate_series():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [0]})
+    result = df.select(
+        f.generate_series(literal(1), literal(3), literal(1)).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() == [1, 2, 3]
+
+
+def test_array_contains():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(f.array_contains(column("a"), literal(2)).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() is True
+
+
+def test_list_contains():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(f.list_contains(column("a"), literal(99)).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() is False
+
+
+def test_list_empty():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[], [1, 2]]})
+    result = df.select(f.list_empty(column("a")).alias("v")).collect()
+    values = [row.as_py() for row in result[0].column(0)]
+    assert values == [True, False]
+
+
+def test_list_pop_back():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(f.list_pop_back(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == [1, 2]
+
+
+def test_list_pop_front():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(f.list_pop_front(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == [2, 3]
+
+
+def test_list_has():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(f.list_has(column("a"), literal(2)).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() is True
+
+
+def test_list_has_all():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(
+        f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() is True
+
+
+def test_list_has_any():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(
+        f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() is True

From 84d739339514464e18dce2656aeb931be3a88667 Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Fri, 3 Apr 2026 14:17:08 -0400
Subject: [PATCH 3/9] Improve array function APIs: optional params, better
 naming, restore comment

- Make null_string optional in string_to_array/string_to_list
- Make step optional in gen_series/generate_series
- Rename first_array/second_array to array/element in
  array_contains/list_has/list_contains
- Restore # Window Functions section comment in __all__
- Add tests for optional parameter variants

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/core/src/functions.rs   | 30 ++++++++++++++++++++++--
 python/datafusion/functions.py | 43 ++++++++++++++++++++--------------
 python/tests/test_functions.py | 31 +++++++++++++++++-------
 3 files changed, 75 insertions(+), 29 deletions(-)

diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs
index 94759ac4a..286c94291 100644
--- a/crates/core/src/functions.rs
+++ b/crates/core/src/functions.rs
@@ -109,6 +109,34 @@ fn arrays_zip(exprs: Vec<PyExpr>) -> PyExpr {
     datafusion::functions_nested::expr_fn::arrays_zip(exprs).into()
 }
 
+#[pyfunction]
+#[pyo3(signature = (string, delimiter, null_string=None))]
+fn string_to_array(string: PyExpr, delimiter: PyExpr, null_string: Option<PyExpr>) -> PyExpr {
+    let mut args = vec![string.into(), delimiter.into()];
+    if let Some(null_string) = null_string {
+        args.push(null_string.into());
+    }
+    Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf(
+        datafusion::functions_nested::string::string_to_array_udf(),
+        args,
+    ))
+    .into()
+}
+
+#[pyfunction]
+#[pyo3(signature = (start, stop, step=None))]
+fn gen_series(start: PyExpr, stop: PyExpr, step: Option<PyExpr>) -> PyExpr {
+    let mut args = vec![start.into(), stop.into()];
+    if let Some(step) = step {
+        args.push(step.into());
+    }
+    Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf(
+        datafusion::functions_nested::range::gen_series_udf(),
+        args,
+    ))
+    .into()
+}
+
 #[pyfunction]
 #[pyo3(signature = (array, element, index=None))]
 fn array_position(array: PyExpr, element: PyExpr, index: Option<i64>) -> PyExpr {
@@ -687,8 +715,6 @@ array_fn!(array_any_value, array);
 array_fn!(array_max, array);
 array_fn!(array_min, array);
 array_fn!(array_reverse, array);
-array_fn!(string_to_array, string delimiter null_string);
-array_fn!(gen_series, start stop step);
 array_fn!(cardinality, array);
 array_fn!(flatten, array);
 array_fn!(range, start stop step);
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index e1954d4d9..a09e6b4fa 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -317,6 +317,7 @@
     "var_samp",
     "var_sample",
     "when",
+    # Window Functions
     "window",
 ]
 
@@ -2848,22 +2849,22 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr:
     return Expr(f.array_has_any(first_array.expr, second_array.expr))
 
 
-def array_contains(first_array: Expr, second_array: Expr) -> Expr:
-    """Returns true if the element appears in the first array, otherwise false.
+def array_contains(array: Expr, element: Expr) -> Expr:
+    """Returns true if the element appears in the array, otherwise false.
 
     See Also:
         This is an alias for :py:func:`array_has`.
     """
-    return array_has(first_array, second_array)
+    return array_has(array, element)
 
 
-def list_has(first_array: Expr, second_array: Expr) -> Expr:
-    """Returns true if the element appears in the first array, otherwise false.
+def list_has(array: Expr, element: Expr) -> Expr:
+    """Returns true if the element appears in the array, otherwise false.
 
     See Also:
         This is an alias for :py:func:`array_has`.
     """
-    return array_has(first_array, second_array)
+    return array_has(array, element)
 
 
 def list_has_all(first_array: Expr, second_array: Expr) -> Expr:
@@ -2884,13 +2885,13 @@ def list_has_any(first_array: Expr, second_array: Expr) -> Expr:
     return array_has_any(first_array, second_array)
 
 
-def list_contains(first_array: Expr, second_array: Expr) -> Expr:
-    """Returns true if the element appears in the first array, otherwise false.
+def list_contains(array: Expr, element: Expr) -> Expr:
+    """Returns true if the element appears in the array, otherwise false.
 
     See Also:
         This is an alias for :py:func:`array_has`.
     """
-    return array_has(first_array, second_array)
+    return array_has(array, element)
 
 
 def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr:
@@ -3590,25 +3591,30 @@ def list_zip(*arrays: Expr) -> Expr:
     return arrays_zip(*arrays)
 
 
-def string_to_array(string: Expr, delimiter: Expr, null_string: Expr) -> Expr:
+def string_to_array(
+    string: Expr, delimiter: Expr, null_string: Expr | None = None
+) -> Expr:
     """Splits a string based on a delimiter and returns an array of parts.
 
-    Any parts matching the ``null_string`` will be replaced with ``NULL``.
+    Any parts matching the optional ``null_string`` will be replaced with ``NULL``.
 
     Examples:
         >>> ctx = dfn.SessionContext()
         >>> df = ctx.from_pydict({"a": ["hello,world"]})
         >>> result = df.select(
         ...     dfn.functions.string_to_array(
-        ...         dfn.col("a"), dfn.lit(","), dfn.lit(""),
+        ...         dfn.col("a"), dfn.lit(","),
         ...     ).alias("result"))
         >>> result.collect_column("result")[0].as_py()
         ['hello', 'world']
     """
-    return Expr(f.string_to_array(string.expr, delimiter.expr, null_string.expr))
+    null_expr = null_string.expr if null_string is not None else None
+    return Expr(f.string_to_array(string.expr, delimiter.expr, null_expr))
 
 
-def string_to_list(string: Expr, delimiter: Expr, null_string: Expr) -> Expr:
+def string_to_list(
+    string: Expr, delimiter: Expr, null_string: Expr | None = None
+) -> Expr:
     """Splits a string based on a delimiter and returns an array of parts.
 
     See Also:
@@ -3617,7 +3623,7 @@ def string_to_list(string: Expr, delimiter: Expr, null_string: Expr) -> Expr:
     return string_to_array(string, delimiter, null_string)
 
 
-def gen_series(start: Expr, stop: Expr, step: Expr) -> Expr:
+def gen_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr:
     """Creates a list of values in the range between start and stop.
 
     Unlike :py:func:`range`, this includes the upper bound.
@@ -3627,15 +3633,16 @@ def gen_series(start: Expr, stop: Expr, step: Expr) -> Expr:
         >>> df = ctx.from_pydict({"a": [0]})
         >>> result = df.select(
         ...     dfn.functions.gen_series(
-        ...         dfn.lit(1), dfn.lit(5), dfn.lit(1),
+        ...         dfn.lit(1), dfn.lit(5),
         ...     ).alias("result"))
         >>> result.collect_column("result")[0].as_py()
         [1, 2, 3, 4, 5]
     """
-    return Expr(f.gen_series(start.expr, stop.expr, step.expr))
+    step_expr = step.expr if step is not None else None
+    return Expr(f.gen_series(start.expr, stop.expr, step_expr))
 
 
-def generate_series(start: Expr, stop: Expr, step: Expr) -> Expr:
+def generate_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr:
     """Creates a list of values in the range between start and stop.
 
     Unlike :py:func:`range`, this includes the upper bound.
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 8f3c9b856..7b642326b 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -1567,35 +1567,48 @@ def test_string_to_array():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": ["hello,world,foo"]})
     result = df.select(
-        f.string_to_array(column("a"), literal(","), literal("")).alias("v")
+        f.string_to_array(column("a"), literal(",")).alias("v")
     ).collect()
     assert result[0].column(0)[0].as_py() == ["hello", "world", "foo"]
 
 
-def test_string_to_list():
+def test_string_to_array_with_null_string():
     ctx = SessionContext()
-    df = ctx.from_pydict({"a": ["a-b-c"]})
+    df = ctx.from_pydict({"a": ["hello,NA,world"]})
     result = df.select(
-        f.string_to_list(column("a"), literal("-"), literal("")).alias("v")
+        f.string_to_array(column("a"), literal(","), literal("NA")).alias("v")
     ).collect()
+    values = result[0].column(0)[0].as_py()
+    assert values == ["hello", None, "world"]
+
+
+def test_string_to_list():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": ["a-b-c"]})
+    result = df.select(f.string_to_list(column("a"), literal("-")).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == ["a", "b", "c"]
 
 
 def test_gen_series():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [0]})
-    result = df.select(
-        f.gen_series(literal(1), literal(5), literal(1)).alias("v")
-    ).collect()
+    result = df.select(f.gen_series(literal(1), literal(5)).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == [1, 2, 3, 4, 5]
 
 
-def test_generate_series():
+def test_gen_series_with_step():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [0]})
     result = df.select(
-        f.generate_series(literal(1), literal(3), literal(1)).alias("v")
+        f.gen_series(literal(1), literal(10), literal(3)).alias("v")
     ).collect()
+    assert result[0].column(0)[0].as_py() == [1, 4, 7, 10]
+
+
+def test_generate_series():
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [0]})
+    result = df.select(f.generate_series(literal(1), literal(3)).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == [1, 2, 3]
 
 

From 6581b4f82ef570e44fdec563585c774aed17c98b Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Fri, 3 Apr 2026 14:25:15 -0400
Subject: [PATCH 4/9] Consolidate array/list function tests using pytest
 parametrize

Reduce 26 individual tests to 14 test functions with parametrized
cases, eliminating boilerplate while maintaining full coverage.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/tests/test_functions.py | 173 ++++++++++++---------------------
 1 file changed, 61 insertions(+), 112 deletions(-)

diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 7b642326b..49c86c8fb 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -1481,94 +1481,62 @@ def test_array_any_value():
     assert values[2] == 1
 
 
-def test_list_any_value():
+@pytest.mark.parametrize("func", [f.array_any_value, f.list_any_value])
+def test_any_value_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[None, 5]]})
-    result = df.select(f.list_any_value(column("a")).alias("v")).collect()
+    result = df.select(func(column("a")).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == 5
 
 
-def test_array_distance():
+@pytest.mark.parametrize("func", [f.array_distance, f.list_distance])
+def test_array_distance_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 4.0]]})
-    result = df.select(f.array_distance(column("a"), column("b")).alias("v")).collect()
+    result = df.select(func(column("a"), column("b")).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == pytest.approx(2.0)
 
 
-def test_list_distance():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[3.0, 0.0]], "b": [[0.0, 4.0]]})
-    result = df.select(f.list_distance(column("a"), column("b")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == pytest.approx(5.0)
-
-
-def test_array_max():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]})
-    result = df.select(f.array_max(column("a")).alias("v")).collect()
-    values = [row.as_py() for row in result[0].column(0)]
-    assert values == [5, 10]
-
-
-def test_list_max():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[7, 2, 9]]})
-    result = df.select(f.list_max(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == 9
-
-
-def test_array_min():
+@pytest.mark.parametrize(
+    ("func", "expected"),
+    [
+        (f.array_max, [5, 10]),
+        (f.list_max, [5, 10]),
+        (f.array_min, [1, 2]),
+        (f.list_min, [1, 2]),
+    ],
+)
+def test_array_min_max(func, expected):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]})
-    result = df.select(f.array_min(column("a")).alias("v")).collect()
+    result = df.select(func(column("a")).alias("v")).collect()
     values = [row.as_py() for row in result[0].column(0)]
-    assert values == [1, 2]
+    assert values == expected
 
 
-def test_list_min():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[7, 2, 9]]})
-    result = df.select(f.list_min(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == 2
-
-
-def test_array_reverse():
+@pytest.mark.parametrize("func", [f.array_reverse, f.list_reverse])
+def test_array_reverse_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]})
-    result = df.select(f.array_reverse(column("a")).alias("v")).collect()
+    result = df.select(func(column("a")).alias("v")).collect()
     values = [row.as_py() for row in result[0].column(0)]
     assert values == [[3, 2, 1], [5, 4]]
 
 
-def test_list_reverse():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[10, 20, 30]]})
-    result = df.select(f.list_reverse(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == [30, 20, 10]
-
-
-def test_arrays_zip():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
-    result = df.select(f.arrays_zip(column("a"), column("b")).alias("v")).collect()
-    values = result[0].column(0)[0].as_py()
-    assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}]
-
-
-def test_list_zip():
+@pytest.mark.parametrize("func", [f.arrays_zip, f.list_zip])
+def test_arrays_zip_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]})
-    result = df.select(f.list_zip(column("a"), column("b")).alias("v")).collect()
+    result = df.select(func(column("a"), column("b")).alias("v")).collect()
     values = result[0].column(0)[0].as_py()
     assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}]
 
 
-def test_string_to_array():
+@pytest.mark.parametrize("func", [f.string_to_array, f.string_to_list])
+def test_string_to_array_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": ["hello,world,foo"]})
-    result = df.select(
-        f.string_to_array(column("a"), literal(",")).alias("v")
-    ).collect()
+    result = df.select(func(column("a"), literal(",")).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == ["hello", "world", "foo"]
 
 
@@ -1582,17 +1550,11 @@ def test_string_to_array_with_null_string():
     assert values == ["hello", None, "world"]
 
 
-def test_string_to_list():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": ["a-b-c"]})
-    result = df.select(f.string_to_list(column("a"), literal("-")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == ["a", "b", "c"]
-
-
-def test_gen_series():
+@pytest.mark.parametrize("func", [f.gen_series, f.generate_series])
+def test_gen_series_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [0]})
-    result = df.select(f.gen_series(literal(1), literal(5)).alias("v")).collect()
+    result = df.select(func(literal(1), literal(5)).alias("v")).collect()
     assert result[0].column(0)[0].as_py() == [1, 2, 3, 4, 5]
 
 
@@ -1605,25 +1567,37 @@ def test_gen_series_with_step():
     assert result[0].column(0)[0].as_py() == [1, 4, 7, 10]
 
 
-def test_generate_series():
+@pytest.mark.parametrize(
+    ("func", "element", "expected"),
+    [
+        (f.array_contains, literal(2), True),
+        (f.list_contains, literal(99), False),
+        (f.list_has, literal(2), True),
+    ],
+)
+def test_element_containment(func, element, expected):
     ctx = SessionContext()
-    df = ctx.from_pydict({"a": [0]})
-    result = df.select(f.generate_series(literal(1), literal(3)).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == [1, 2, 3]
+    df = ctx.from_pydict({"a": [[1, 2, 3]]})
+    result = df.select(func(column("a"), element).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() is expected
 
 
-def test_array_contains():
+def test_list_has_all():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(f.array_contains(column("a"), literal(2)).alias("v")).collect()
+    result = df.select(
+        f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v")
+    ).collect()
     assert result[0].column(0)[0].as_py() is True
 
 
-def test_list_contains():
+def test_list_has_any():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(f.list_contains(column("a"), literal(99)).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() is False
+    result = df.select(
+        f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v")
+    ).collect()
+    assert result[0].column(0)[0].as_py() is True
 
 
 def test_list_empty():
@@ -1634,40 +1608,15 @@ def test_list_empty():
     assert values == [True, False]
 
 
-def test_list_pop_back():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(f.list_pop_back(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == [1, 2]
-
-
-def test_list_pop_front():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(f.list_pop_front(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == [2, 3]
-
-
-def test_list_has():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(f.list_has(column("a"), literal(2)).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() is True
-
-
-def test_list_has_all():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(
-        f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v")
-    ).collect()
-    assert result[0].column(0)[0].as_py() is True
-
-
-def test_list_has_any():
+@pytest.mark.parametrize(
+    ("func", "expected"),
+    [
+        (f.list_pop_back, [1, 2]),
+        (f.list_pop_front, [2, 3]),
+    ],
+)
+def test_list_pop(func, expected):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(
-        f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v")
-    ).collect()
-    assert result[0].column(0)[0].as_py() is True
+    result = df.select(func(column("a")).alias("v")).collect()
+    assert result[0].column(0)[0].as_py() == expected

From b4b8775c02ac3d8646cd8e83859c7d42a8bdc745 Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Fri, 3 Apr 2026 14:29:43 -0400
Subject: [PATCH 5/9] Move list alias tests into existing test_array_functions
 parametrize block

Merge standalone tests for list_empty, list_pop_back, list_pop_front,
list_has, array_contains, list_contains, list_has_all, and list_has_any
into the existing parametrized test_array_functions block alongside
their array_* counterparts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/tests/test_functions.py | 91 ++++++++++++++--------------------
 1 file changed, 36 insertions(+), 55 deletions(-)

diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 49c86c8fb..9bfde1b91 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -330,6 +330,10 @@ def py_flatten(arr):
             f.empty,
             lambda data: [len(r) == 0 for r in data],
         ),
+        (
+            f.list_empty,
+            lambda data: [len(r) == 0 for r in data],
+        ),
         (
             lambda col: f.array_extract(col, literal(1)),
             lambda data: [r[0] for r in data],
@@ -354,18 +358,42 @@ def py_flatten(arr):
             lambda col: f.array_has(col, literal(1.0)),
             lambda data: [1.0 in r for r in data],
         ),
+        (
+            lambda col: f.list_has(col, literal(1.0)),
+            lambda data: [1.0 in r for r in data],
+        ),
+        (
+            lambda col: f.array_contains(col, literal(1.0)),
+            lambda data: [1.0 in r for r in data],
+        ),
+        (
+            lambda col: f.list_contains(col, literal(1.0)),
+            lambda data: [1.0 in r for r in data],
+        ),
         (
             lambda col: f.array_has_all(
                 col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]])
             ),
             lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
         ),
+        (
+            lambda col: f.list_has_all(
+                col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]])
+            ),
+            lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
+        ),
         (
             lambda col: f.array_has_any(
                 col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]])
             ),
             lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
         ),
+        (
+            lambda col: f.list_has_any(
+                col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]])
+            ),
+            lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
+        ),
         (
             lambda col: f.array_position(col, literal(1.0)),
             lambda data: [py_indexof(r, 1.0) for r in data],
@@ -418,10 +446,18 @@ def py_flatten(arr):
             f.array_pop_back,
             lambda data: [arr[:-1] for arr in data],
         ),
+        (
+            f.list_pop_back,
+            lambda data: [arr[:-1] for arr in data],
+        ),
         (
             f.array_pop_front,
             lambda data: [arr[1:] for arr in data],
         ),
+        (
+            f.list_pop_front,
+            lambda data: [arr[1:] for arr in data],
+        ),
         (
             lambda col: f.array_remove(col, literal(3.0)),
             lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data],
@@ -1565,58 +1601,3 @@ def test_gen_series_with_step():
         f.gen_series(literal(1), literal(10), literal(3)).alias("v")
     ).collect()
     assert result[0].column(0)[0].as_py() == [1, 4, 7, 10]
-
-
-@pytest.mark.parametrize(
-    ("func", "element", "expected"),
-    [
-        (f.array_contains, literal(2), True),
-        (f.list_contains, literal(99), False),
-        (f.list_has, literal(2), True),
-    ],
-)
-def test_element_containment(func, element, expected):
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(func(column("a"), element).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() is expected
-
-
-def test_list_has_all():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(
-        f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v")
-    ).collect()
-    assert result[0].column(0)[0].as_py() is True
-
-
-def test_list_has_any():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(
-        f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v")
-    ).collect()
-    assert result[0].column(0)[0].as_py() is True
-
-
-def test_list_empty():
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[], [1, 2]]})
-    result = df.select(f.list_empty(column("a")).alias("v")).collect()
-    values = [row.as_py() for row in result[0].column(0)]
-    assert values == [True, False]
-
-
-@pytest.mark.parametrize(
-    ("func", "expected"),
-    [
-        (f.list_pop_back, [1, 2]),
-        (f.list_pop_front, [2, 3]),
-    ],
-)
-def test_list_pop(func, expected):
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[1, 2, 3]]})
-    result = df.select(func(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == expected

From ef48dd984c7d281c8bdaf1a6976f7a21ef296aaf Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Fri, 3 Apr 2026 14:32:38 -0400
Subject: [PATCH 6/9] Merge test_array_any_value into parametrized
 test_any_value_aliases

Use the richer multi-row dataset (including all-nulls case) for both
array_any_value and list_any_value via the parametrized test.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/tests/test_functions.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 9bfde1b91..e2817c74e 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -1507,24 +1507,17 @@ def test_coalesce(df):
     )
 
 
-def test_array_any_value():
+@pytest.mark.parametrize("func", [f.array_any_value, f.list_any_value])
+def test_any_value_aliases(func):
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [[None, 2, 3], [None, None, None], [1, 2, 3]]})
-    result = df.select(f.array_any_value(column("a")).alias("v")).collect()
+    result = df.select(func(column("a")).alias("v")).collect()
     values = [row.as_py() for row in result[0].column(0)]
     assert values[0] == 2
     assert values[1] is None
     assert values[2] == 1
 
 
-@pytest.mark.parametrize("func", [f.array_any_value, f.list_any_value])
-def test_any_value_aliases(func):
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": [[None, 5]]})
-    result = df.select(func(column("a")).alias("v")).collect()
-    assert result[0].column(0)[0].as_py() == 5
-
-
 @pytest.mark.parametrize("func", [f.array_distance, f.list_distance])
 def test_array_distance_aliases(func):
     ctx = SessionContext()

From db9163851d09886a4f756da999f57d5bc958973c Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Fri, 3 Apr 2026 15:08:19 -0400
Subject: [PATCH 7/9] Add arrays_overlap and list_overlap as aliases for
 array_has_any

These aliases match the upstream DataFusion SQL-level aliases, completing
the set of missing array functions from issue #1452.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/datafusion/functions.py | 20 ++++++++++++++++++++
 python/tests/test_functions.py | 12 ++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index a09e6b4fa..118797e0f 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -95,6 +95,7 @@
     "array_sort",
     "array_to_string",
     "array_union",
+    "arrays_overlap",
     "arrays_zip",
     "arrow_cast",
     "arrow_typeof",
@@ -194,6 +195,7 @@
     "list_max",
     "list_min",
     "list_ndims",
+    "list_overlap",
     "list_pop_back",
     "list_pop_front",
     "list_position",
@@ -2885,6 +2887,24 @@ def list_has_any(first_array: Expr, second_array: Expr) -> Expr:
     return array_has_any(first_array, second_array)
 
 
+def arrays_overlap(first_array: Expr, second_array: Expr) -> Expr:
+    """Returns true if any element appears in both arrays.
+
+    See Also:
+        This is an alias for :py:func:`array_has_any`.
+    """
+    return array_has_any(first_array, second_array)
+
+
+def list_overlap(first_array: Expr, second_array: Expr) -> Expr:
+    """Returns true if any element appears in both arrays.
+
+    See Also:
+        This is an alias for :py:func:`array_has_any`.
+    """
+    return array_has_any(first_array, second_array)
+
+
 def list_contains(array: Expr, element: Expr) -> Expr:
     """Returns true if the element appears in the array, otherwise false.
 
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index e2817c74e..07278d791 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -394,6 +394,18 @@ def py_flatten(arr):
             ),
             lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
         ),
+        (
+            lambda col: f.arrays_overlap(
+                col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]])
+            ),
+            lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
+        ),
+        (
+            lambda col: f.list_overlap(
+                col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]])
+            ),
+            lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data],
+        ),
         (
             lambda col: f.array_position(col, literal(1.0)),
             lambda data: [py_indexof(r, 1.0) for r in data],

From fe4e2f7ea954c8a71687d1daf013171087a394ec Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Mon, 6 Apr 2026 07:10:23 -0400
Subject: [PATCH 8/9] Add docstring examples for optional params in
 string_to_array and gen_series

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 python/datafusion/functions.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 9bf6fab09..1b267731e 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -3702,6 +3702,15 @@ def string_to_array(
         ...     ).alias("result"))
         >>> result.collect_column("result")[0].as_py()
         ['hello', 'world']
+
+        Replace parts matching a ``null_string`` with ``NULL``:
+
+        >>> result = df.select(
+        ...     dfn.functions.string_to_array(
+        ...         dfn.col("a"), dfn.lit(","), null_string=dfn.lit("world"),
+        ...     ).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        ['hello', None]
     """
     null_expr = null_string.expr if null_string is not None else None
     return Expr(f.string_to_array(string.expr, delimiter.expr, null_expr))
@@ -3732,6 +3741,15 @@ def gen_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr:
         ...     ).alias("result"))
         >>> result.collect_column("result")[0].as_py()
         [1, 2, 3, 4, 5]
+
+        Specify a custom ``step``:
+
+        >>> result = df.select(
+        ...     dfn.functions.gen_series(
+        ...         dfn.lit(1), dfn.lit(10), step=dfn.lit(3),
+        ...     ).alias("result"))
+        >>> result.collect_column("result")[0].as_py()
+        [1, 4, 7, 10]
     """
     step_expr = step.expr if step is not None else None
     return Expr(f.gen_series(start.expr, stop.expr, step_expr))

From dee9719242d25f2a309c23d8dae899a412e0c946 Mon Sep 17 00:00:00 2001
From: Tim Saucer <timsaucer@gmail.com>
Date: Mon, 6 Apr 2026 07:23:45 -0400
Subject: [PATCH 9/9] Update AGENTS file to demonstrate preferred method of
 documenting python functions

---
 AGENTS.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index 1853a84cd..f6fdfbd90 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -25,3 +25,20 @@ Skills follow the [Agent Skills](https://agentskills.io) open standard. Each ski
 
 - `SKILL.md` — The skill definition with YAML frontmatter (name, description, argument-hint) and detailed instructions.
 - Additional supporting files as needed.
+
+## Python Function Docstrings
+
+Every Python function must include a docstring; all but simple aliases need usage examples.
+
+- **Examples are required**: Each non-alias function needs at least one doctest-style
+  example demonstrating basic usage.
+- **Optional parameters**: If a function has optional parameters, include separate
+  examples that show usage both without and with the optional arguments. Pass
+  optional arguments using their keyword name (e.g., `step=dfn.lit(3)`) so readers
+  can immediately see which parameter is being demonstrated.
+- **Reuse input data**: Use the same input data across examples wherever possible.
+  The examples should demonstrate how different optional arguments change the output
+  for the same input, making the effect of each option easy to understand.
+- **Alias functions**: Functions that are simple aliases (e.g., `list_sort` aliasing
+  `array_sort`) only need a one-line description and a `See Also` reference to the
+  primary function. They do not need their own examples.