From b9d5f6539fb4449b2a758de265db3af2de5c9510 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 30 Mar 2026 16:06:41 -0400 Subject: [PATCH 1/9] Add missing array/list functions and aliases (#1452) Add new array functions from upstream DataFusion v53: array_any_value, array_distance, array_max, array_min, array_reverse, arrays_zip, string_to_array, and gen_series. Add corresponding list_* aliases and missing list_* aliases for existing functions (list_empty, list_pop_back, list_pop_front, list_has, list_has_all, list_has_any). Also add array_contains/list_contains as aliases for array_has, generate_series as alias for gen_series, and string_to_list as alias for string_to_array. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/core/src/functions.rs | 30 ++++ python/datafusion/functions.py | 294 ++++++++++++++++++++++++++++++++- 2 files changed, 323 insertions(+), 1 deletion(-) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index fefe14b3e..94759ac4a 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -93,6 +93,22 @@ fn array_cat(exprs: Vec<PyExpr>) -> PyExpr { array_concat(exprs) } +#[pyfunction] +fn array_distance(array1: PyExpr, array2: PyExpr) -> PyExpr { + let args = vec![array1.into(), array2.into()]; + Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf( + datafusion::functions_nested::distance::array_distance_udf(), + args, + )) + .into() +} + +#[pyfunction] +fn arrays_zip(exprs: Vec<PyExpr>) -> PyExpr { + let exprs = exprs.into_iter().map(|x| x.into()).collect(); + datafusion::functions_nested::expr_fn::arrays_zip(exprs).into() +} + #[pyfunction] #[pyo3(signature = (array, element, index=None))] fn array_position(array: PyExpr, element: PyExpr, index: Option<i64>) -> PyExpr { @@ -667,6 +683,12 @@ array_fn!(array_intersect, first_array second_array); array_fn!(array_union, array1 array2); array_fn!(array_except, first_array second_array); array_fn!(array_resize, array size value); 
+array_fn!(array_any_value, array); +array_fn!(array_max, array); +array_fn!(array_min, array); +array_fn!(array_reverse, array); +array_fn!(string_to_array, string delimiter null_string); +array_fn!(gen_series, start stop step); array_fn!(cardinality, array); array_fn!(flatten, array); array_fn!(range, start stop step); @@ -1129,6 +1151,14 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(array_replace_all))?; m.add_wrapped(wrap_pyfunction!(array_sort))?; m.add_wrapped(wrap_pyfunction!(array_slice))?; + m.add_wrapped(wrap_pyfunction!(array_any_value))?; + m.add_wrapped(wrap_pyfunction!(array_distance))?; + m.add_wrapped(wrap_pyfunction!(array_max))?; + m.add_wrapped(wrap_pyfunction!(array_min))?; + m.add_wrapped(wrap_pyfunction!(array_reverse))?; + m.add_wrapped(wrap_pyfunction!(arrays_zip))?; + m.add_wrapped(wrap_pyfunction!(string_to_array))?; + m.add_wrapped(wrap_pyfunction!(gen_series))?; m.add_wrapped(wrap_pyfunction!(flatten))?; m.add_wrapped(wrap_pyfunction!(cardinality))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 2ef2f0473..e1954d4d9 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -53,10 +53,13 @@ "approx_percentile_cont_with_weight", "array", "array_agg", + "array_any_value", "array_append", "array_cat", "array_concat", + "array_contains", "array_dims", + "array_distance", "array_distinct", "array_element", "array_empty", @@ -69,6 +72,8 @@ "array_intersect", "array_join", "array_length", + "array_max", + "array_min", "array_ndims", "array_pop_back", "array_pop_front", @@ -85,10 +90,12 @@ "array_replace_all", "array_replace_n", "array_resize", + "array_reverse", "array_slice", "array_sort", "array_to_string", "array_union", + "arrays_zip", "arrow_cast", "arrow_typeof", "ascii", @@ -152,6 +159,8 @@ "floor", "from_unixtime", "gcd", + "gen_series", + "generate_series", "in_list", "initcap", "isnan", @@ -163,19 +172,30 @@ "left", 
"length", "levenshtein", + "list_any_value", "list_append", "list_cat", "list_concat", + "list_contains", "list_dims", + "list_distance", "list_distinct", "list_element", + "list_empty", "list_except", "list_extract", + "list_has", + "list_has_all", + "list_has_any", "list_indexof", "list_intersect", "list_join", "list_length", + "list_max", + "list_min", "list_ndims", + "list_pop_back", + "list_pop_front", "list_position", "list_positions", "list_prepend", @@ -189,10 +209,12 @@ "list_replace_all", "list_replace_n", "list_resize", + "list_reverse", "list_slice", "list_sort", "list_to_string", "list_union", + "list_zip", "ln", "log", "log2", @@ -263,6 +285,8 @@ "stddev_pop", "stddev_samp", "string_agg", + "string_to_array", + "string_to_list", "strpos", "struct", "substr", @@ -293,7 +317,6 @@ "var_samp", "var_sample", "when", - # Window Functions "window", ] @@ -2719,6 +2742,15 @@ def array_empty(array: Expr) -> Expr: return Expr(f.array_empty(array.expr)) +def list_empty(array: Expr) -> Expr: + """Returns a boolean indicating whether the array is empty. + + See Also: + This is an alias for :py:func:`array_empty`. + """ + return array_empty(array) + + def array_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. @@ -2816,6 +2848,51 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr: return Expr(f.array_has_any(first_array.expr, second_array.expr)) +def array_contains(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if the element appears in the first array, otherwise false. + + See Also: + This is an alias for :py:func:`array_has`. + """ + return array_has(first_array, second_array) + + +def list_has(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if the element appears in the first array, otherwise false. + + See Also: + This is an alias for :py:func:`array_has`. 
+ """ + return array_has(first_array, second_array) + + +def list_has_all(first_array: Expr, second_array: Expr) -> Expr: + """Determines if there is complete overlap ``second_array`` in ``first_array``. + + See Also: + This is an alias for :py:func:`array_has_all`. + """ + return array_has_all(first_array, second_array) + + +def list_has_any(first_array: Expr, second_array: Expr) -> Expr: + """Determine if there is an overlap between ``first_array`` and ``second_array``. + + See Also: + This is an alias for :py:func:`array_has_any`. + """ + return array_has_any(first_array, second_array) + + +def list_contains(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if the element appears in the first array, otherwise false. + + See Also: + This is an alias for :py:func:`array_has`. + """ + return array_has(first_array, second_array) + + def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. @@ -2983,6 +3060,24 @@ def array_pop_front(array: Expr) -> Expr: return Expr(f.array_pop_front(array.expr)) +def list_pop_back(array: Expr) -> Expr: + """Returns the array without the last element. + + See Also: + This is an alias for :py:func:`array_pop_back`. + """ + return array_pop_back(array) + + +def list_pop_front(array: Expr) -> Expr: + """Returns the array without the first element. + + See Also: + This is an alias for :py:func:`array_pop_front`. + """ + return array_pop_front(array) + + def array_remove(array: Expr, element: Expr) -> Expr: """Removes the first element from the array equal to the given value. @@ -3354,6 +3449,203 @@ def list_resize(array: Expr, size: Expr, value: Expr) -> Expr: return array_resize(array, size, value) +def array_any_value(array: Expr) -> Expr: + """Returns the first non-null element in the array. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[None, 2, 3]]}) + >>> result = df.select( + ... 
dfn.functions.array_any_value(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 2 + """ + return Expr(f.array_any_value(array.expr)) + + +def list_any_value(array: Expr) -> Expr: + """Returns the first non-null element in the array. + + See Also: + This is an alias for :py:func:`array_any_value`. + """ + return array_any_value(array) + + +def array_distance(array1: Expr, array2: Expr) -> Expr: + """Returns the Euclidean distance between two numeric arrays. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 4.0]]}) + >>> result = df.select( + ... dfn.functions.array_distance( + ... dfn.col("a"), dfn.col("b"), + ... ).alias("result")) + >>> result.collect_column("result")[0].as_py() + 2.0 + """ + return Expr(f.array_distance(array1.expr, array2.expr)) + + +def list_distance(array1: Expr, array2: Expr) -> Expr: + """Returns the Euclidean distance between two numeric arrays. + + See Also: + This is an alias for :py:func:`array_distance`. + """ + return array_distance(array1, array2) + + +def array_max(array: Expr) -> Expr: + """Returns the maximum value in the array. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_max(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 3 + """ + return Expr(f.array_max(array.expr)) + + +def list_max(array: Expr) -> Expr: + """Returns the maximum value in the array. + + See Also: + This is an alias for :py:func:`array_max`. + """ + return array_max(array) + + +def array_min(array: Expr) -> Expr: + """Returns the minimum value in the array. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... 
dfn.functions.array_min(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 1 + """ + return Expr(f.array_min(array.expr)) + + +def list_min(array: Expr) -> Expr: + """Returns the minimum value in the array. + + See Also: + This is an alias for :py:func:`array_min`. + """ + return array_min(array) + + +def array_reverse(array: Expr) -> Expr: + """Reverses the order of elements in the array. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_reverse(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [3, 2, 1] + """ + return Expr(f.array_reverse(array.expr)) + + +def list_reverse(array: Expr) -> Expr: + """Reverses the order of elements in the array. + + See Also: + This is an alias for :py:func:`array_reverse`. + """ + return array_reverse(array) + + +def arrays_zip(*arrays: Expr) -> Expr: + """Combines multiple arrays into a single array of structs. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + >>> result = df.select( + ... dfn.functions.arrays_zip(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [{'c0': 1, 'c1': 3}, {'c0': 2, 'c1': 4}] + """ + args = [a.expr for a in arrays] + return Expr(f.arrays_zip(args)) + + +def list_zip(*arrays: Expr) -> Expr: + """Combines multiple arrays into a single array of structs. + + See Also: + This is an alias for :py:func:`arrays_zip`. + """ + return arrays_zip(*arrays) + + +def string_to_array(string: Expr, delimiter: Expr, null_string: Expr) -> Expr: + """Splits a string based on a delimiter and returns an array of parts. + + Any parts matching the ``null_string`` will be replaced with ``NULL``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello,world"]}) + >>> result = df.select( + ... dfn.functions.string_to_array( + ... 
dfn.col("a"), dfn.lit(","), dfn.lit(""), + ... ).alias("result")) + >>> result.collect_column("result")[0].as_py() + ['hello', 'world'] + """ + return Expr(f.string_to_array(string.expr, delimiter.expr, null_string.expr)) + + +def string_to_list(string: Expr, delimiter: Expr, null_string: Expr) -> Expr: + """Splits a string based on a delimiter and returns an array of parts. + + See Also: + This is an alias for :py:func:`string_to_array`. + """ + return string_to_array(string, delimiter, null_string) + + +def gen_series(start: Expr, stop: Expr, step: Expr) -> Expr: + """Creates a list of values in the range between start and stop. + + Unlike :py:func:`range`, this includes the upper bound. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0]}) + >>> result = df.select( + ... dfn.functions.gen_series( + ... dfn.lit(1), dfn.lit(5), dfn.lit(1), + ... ).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4, 5] + """ + return Expr(f.gen_series(start.expr, stop.expr, step.expr)) + + +def generate_series(start: Expr, stop: Expr, step: Expr) -> Expr: + """Creates a list of values in the range between start and stop. + + Unlike :py:func:`range`, this includes the upper bound. + + See Also: + This is an alias for :py:func:`gen_series`. + """ + return gen_series(start, stop, step) + + def flatten(array: Expr) -> Expr: """Flattens an array of arrays into a single array. From a7c7de497b7821e9b38449803e13f34bb7902a52 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 30 Mar 2026 16:14:13 -0400 Subject: [PATCH 2/9] Add unit tests for new array/list functions and aliases Tests cover all functions and aliases added in the previous commit: array_any_value, array_distance, array_max, array_min, array_reverse, arrays_zip, string_to_array, gen_series, generate_series, array_contains, list_contains, list_empty, list_pop_back, list_pop_front, list_has, list_has_all, list_has_any, and list_* aliases for the new functions. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 189 +++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index db141fbe0..8f3c9b856 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1469,3 +1469,192 @@ def test_coalesce(df): assert result.column(0) == pa.array( ["Hello", "fallback", "!"], type=pa.string_view() ) + + +def test_array_any_value(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[None, 2, 3], [None, None, None], [1, 2, 3]]}) + result = df.select(f.array_any_value(column("a")).alias("v")).collect() + values = [row.as_py() for row in result[0].column(0)] + assert values[0] == 2 + assert values[1] is None + assert values[2] == 1 + + +def test_list_any_value(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[None, 5]]}) + result = df.select(f.list_any_value(column("a")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == 5 + + +def test_array_distance(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 4.0]]}) + result = df.select(f.array_distance(column("a"), column("b")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == pytest.approx(2.0) + + +def test_list_distance(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[3.0, 0.0]], "b": [[0.0, 4.0]]}) + result = df.select(f.list_distance(column("a"), column("b")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == pytest.approx(5.0) + + +def test_array_max(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]}) + result = df.select(f.array_max(column("a")).alias("v")).collect() + values = [row.as_py() for row in result[0].column(0)] + assert values == [5, 10] + + +def test_list_max(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[7, 2, 9]]}) + result = df.select(f.list_max(column("a")).alias("v")).collect() + assert 
result[0].column(0)[0].as_py() == 9 + + +def test_array_min(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]}) + result = df.select(f.array_min(column("a")).alias("v")).collect() + values = [row.as_py() for row in result[0].column(0)] + assert values == [1, 2] + + +def test_list_min(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[7, 2, 9]]}) + result = df.select(f.list_min(column("a")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == 2 + + +def test_array_reverse(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]}) + result = df.select(f.array_reverse(column("a")).alias("v")).collect() + values = [row.as_py() for row in result[0].column(0)] + assert values == [[3, 2, 1], [5, 4]] + + +def test_list_reverse(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[10, 20, 30]]}) + result = df.select(f.list_reverse(column("a")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == [30, 20, 10] + + +def test_arrays_zip(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + result = df.select(f.arrays_zip(column("a"), column("b")).alias("v")).collect() + values = result[0].column(0)[0].as_py() + assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}] + + +def test_list_zip(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + result = df.select(f.list_zip(column("a"), column("b")).alias("v")).collect() + values = result[0].column(0)[0].as_py() + assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}] + + +def test_string_to_array(): + ctx = SessionContext() + df = ctx.from_pydict({"a": ["hello,world,foo"]}) + result = df.select( + f.string_to_array(column("a"), literal(","), literal("")).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() == ["hello", "world", "foo"] + + +def test_string_to_list(): + ctx = SessionContext() + df = ctx.from_pydict({"a": ["a-b-c"]}) + result = df.select( + 
f.string_to_list(column("a"), literal("-"), literal("")).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() == ["a", "b", "c"] + + +def test_gen_series(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [0]}) + result = df.select( + f.gen_series(literal(1), literal(5), literal(1)).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() == [1, 2, 3, 4, 5] + + +def test_generate_series(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [0]}) + result = df.select( + f.generate_series(literal(1), literal(3), literal(1)).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() == [1, 2, 3] + + +def test_array_contains(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select(f.array_contains(column("a"), literal(2)).alias("v")).collect() + assert result[0].column(0)[0].as_py() is True + + +def test_list_contains(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select(f.list_contains(column("a"), literal(99)).alias("v")).collect() + assert result[0].column(0)[0].as_py() is False + + +def test_list_empty(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[], [1, 2]]}) + result = df.select(f.list_empty(column("a")).alias("v")).collect() + values = [row.as_py() for row in result[0].column(0)] + assert values == [True, False] + + +def test_list_pop_back(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select(f.list_pop_back(column("a")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == [1, 2] + + +def test_list_pop_front(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select(f.list_pop_front(column("a")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == [2, 3] + + +def test_list_has(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select(f.list_has(column("a"), literal(2)).alias("v")).collect() + assert 
result[0].column(0)[0].as_py() is True + + +def test_list_has_all(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select( + f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() is True + + +def test_list_has_any(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select( + f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() is True From 84d739339514464e18dce2656aeb931be3a88667 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 14:17:08 -0400 Subject: [PATCH 3/9] Improve array function APIs: optional params, better naming, restore comment - Make null_string optional in string_to_array/string_to_list - Make step optional in gen_series/generate_series - Rename second_array to element in array_contains/list_has/list_contains - Restore # Window Functions section comment in __all__ - Add tests for optional parameter variants Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/core/src/functions.rs | 30 ++++++++++++++++++++++-- python/datafusion/functions.py | 43 ++++++++++++++++++++-------------- python/tests/test_functions.py | 31 +++++++++++++++++------- 3 files changed, 75 insertions(+), 29 deletions(-) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index 94759ac4a..286c94291 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -109,6 +109,34 @@ fn arrays_zip(exprs: Vec<PyExpr>) -> PyExpr { datafusion::functions_nested::expr_fn::arrays_zip(exprs).into() } +#[pyfunction] +#[pyo3(signature = (string, delimiter, null_string=None))] +fn string_to_array(string: PyExpr, delimiter: PyExpr, null_string: Option<PyExpr>) -> PyExpr { + let mut args = vec![string.into(), delimiter.into()]; + if let Some(null_string) = null_string { + args.push(null_string.into()); + } + 
Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf( + datafusion::functions_nested::string::string_to_array_udf(), + args, + )) + .into() +} + +#[pyfunction] +#[pyo3(signature = (start, stop, step=None))] +fn gen_series(start: PyExpr, stop: PyExpr, step: Option<PyExpr>) -> PyExpr { + let mut args = vec![start.into(), stop.into()]; + if let Some(step) = step { + args.push(step.into()); + } + Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf( + datafusion::functions_nested::range::gen_series_udf(), + args, + )) + .into() +} + #[pyfunction] #[pyo3(signature = (array, element, index=None))] fn array_position(array: PyExpr, element: PyExpr, index: Option<i64>) -> PyExpr { @@ -687,8 +715,6 @@ array_fn!(array_any_value, array); array_fn!(array_max, array); array_fn!(array_min, array); array_fn!(array_reverse, array); -array_fn!(string_to_array, string delimiter null_string); -array_fn!(gen_series, start stop step); array_fn!(cardinality, array); array_fn!(flatten, array); array_fn!(range, start stop step); diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index e1954d4d9..a09e6b4fa 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -317,6 +317,7 @@ "var_samp", "var_sample", "when", + # Window Functions "window", ] @@ -2848,22 +2849,22 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr: return Expr(f.array_has_any(first_array.expr, second_array.expr)) -def array_contains(first_array: Expr, second_array: Expr) -> Expr: - """Returns true if the element appears in the first array, otherwise false. +def array_contains(array: Expr, element: Expr) -> Expr: + """Returns true if the element appears in the array, otherwise false. See Also: This is an alias for :py:func:`array_has`. 
""" - return array_has(first_array, second_array) + return array_has(array, element) -def list_has(first_array: Expr, second_array: Expr) -> Expr: - """Returns true if the element appears in the first array, otherwise false. +def list_has(array: Expr, element: Expr) -> Expr: + """Returns true if the element appears in the array, otherwise false. See Also: This is an alias for :py:func:`array_has`. """ - return array_has(first_array, second_array) + return array_has(array, element) def list_has_all(first_array: Expr, second_array: Expr) -> Expr: @@ -2884,13 +2885,13 @@ def list_has_any(first_array: Expr, second_array: Expr) -> Expr: return array_has_any(first_array, second_array) -def list_contains(first_array: Expr, second_array: Expr) -> Expr: - """Returns true if the element appears in the first array, otherwise false. +def list_contains(array: Expr, element: Expr) -> Expr: + """Returns true if the element appears in the array, otherwise false. See Also: This is an alias for :py:func:`array_has`. """ - return array_has(first_array, second_array) + return array_has(array, element) def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: @@ -3590,25 +3591,30 @@ def list_zip(*arrays: Expr) -> Expr: return arrays_zip(*arrays) -def string_to_array(string: Expr, delimiter: Expr, null_string: Expr) -> Expr: +def string_to_array( + string: Expr, delimiter: Expr, null_string: Expr | None = None +) -> Expr: """Splits a string based on a delimiter and returns an array of parts. - Any parts matching the ``null_string`` will be replaced with ``NULL``. + Any parts matching the optional ``null_string`` will be replaced with ``NULL``. Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": ["hello,world"]}) >>> result = df.select( ... dfn.functions.string_to_array( - ... dfn.col("a"), dfn.lit(","), dfn.lit(""), + ... dfn.col("a"), dfn.lit(","), ... 
).alias("result")) >>> result.collect_column("result")[0].as_py() ['hello', 'world'] """ - return Expr(f.string_to_array(string.expr, delimiter.expr, null_string.expr)) + null_expr = null_string.expr if null_string is not None else None + return Expr(f.string_to_array(string.expr, delimiter.expr, null_expr)) -def string_to_list(string: Expr, delimiter: Expr, null_string: Expr) -> Expr: +def string_to_list( + string: Expr, delimiter: Expr, null_string: Expr | None = None +) -> Expr: """Splits a string based on a delimiter and returns an array of parts. See Also: @@ -3617,7 +3623,7 @@ def string_to_list(string: Expr, delimiter: Expr, null_string: Expr) -> Expr: return string_to_array(string, delimiter, null_string) -def gen_series(start: Expr, stop: Expr, step: Expr) -> Expr: +def gen_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr: """Creates a list of values in the range between start and stop. Unlike :py:func:`range`, this includes the upper bound. @@ -3627,15 +3633,16 @@ def gen_series(start: Expr, stop: Expr, step: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [0]}) >>> result = df.select( ... dfn.functions.gen_series( - ... dfn.lit(1), dfn.lit(5), dfn.lit(1), + ... dfn.lit(1), dfn.lit(5), ... ).alias("result")) >>> result.collect_column("result")[0].as_py() [1, 2, 3, 4, 5] """ - return Expr(f.gen_series(start.expr, stop.expr, step.expr)) + step_expr = step.expr if step is not None else None + return Expr(f.gen_series(start.expr, stop.expr, step_expr)) -def generate_series(start: Expr, stop: Expr, step: Expr) -> Expr: +def generate_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr: """Creates a list of values in the range between start and stop. Unlike :py:func:`range`, this includes the upper bound. 
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 8f3c9b856..7b642326b 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1567,35 +1567,48 @@ def test_string_to_array(): ctx = SessionContext() df = ctx.from_pydict({"a": ["hello,world,foo"]}) result = df.select( - f.string_to_array(column("a"), literal(","), literal("")).alias("v") + f.string_to_array(column("a"), literal(",")).alias("v") ).collect() assert result[0].column(0)[0].as_py() == ["hello", "world", "foo"] -def test_string_to_list(): +def test_string_to_array_with_null_string(): ctx = SessionContext() - df = ctx.from_pydict({"a": ["a-b-c"]}) + df = ctx.from_pydict({"a": ["hello,NA,world"]}) result = df.select( - f.string_to_list(column("a"), literal("-"), literal("")).alias("v") + f.string_to_array(column("a"), literal(","), literal("NA")).alias("v") ).collect() + values = result[0].column(0)[0].as_py() + assert values == ["hello", None, "world"] + + +def test_string_to_list(): + ctx = SessionContext() + df = ctx.from_pydict({"a": ["a-b-c"]}) + result = df.select(f.string_to_list(column("a"), literal("-")).alias("v")).collect() assert result[0].column(0)[0].as_py() == ["a", "b", "c"] def test_gen_series(): ctx = SessionContext() df = ctx.from_pydict({"a": [0]}) - result = df.select( - f.gen_series(literal(1), literal(5), literal(1)).alias("v") - ).collect() + result = df.select(f.gen_series(literal(1), literal(5)).alias("v")).collect() assert result[0].column(0)[0].as_py() == [1, 2, 3, 4, 5] -def test_generate_series(): +def test_gen_series_with_step(): ctx = SessionContext() df = ctx.from_pydict({"a": [0]}) result = df.select( - f.generate_series(literal(1), literal(3), literal(1)).alias("v") + f.gen_series(literal(1), literal(10), literal(3)).alias("v") ).collect() + assert result[0].column(0)[0].as_py() == [1, 4, 7, 10] + + +def test_generate_series(): + ctx = SessionContext() + df = ctx.from_pydict({"a": [0]}) + result = 
df.select(f.generate_series(literal(1), literal(3)).alias("v")).collect() assert result[0].column(0)[0].as_py() == [1, 2, 3] From 6581b4f82ef570e44fdec563585c774aed17c98b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 14:25:15 -0400 Subject: [PATCH 4/9] Consolidate array/list function tests using pytest parametrize Reduce 26 individual tests to 14 test functions with parametrized cases, eliminating boilerplate while maintaining full coverage. Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 173 ++++++++++++--------------------- 1 file changed, 61 insertions(+), 112 deletions(-) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 7b642326b..49c86c8fb 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1481,94 +1481,62 @@ def test_array_any_value(): assert values[2] == 1 -def test_list_any_value(): +@pytest.mark.parametrize("func", [f.array_any_value, f.list_any_value]) +def test_any_value_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": [[None, 5]]}) - result = df.select(f.list_any_value(column("a")).alias("v")).collect() + result = df.select(func(column("a")).alias("v")).collect() assert result[0].column(0)[0].as_py() == 5 -def test_array_distance(): +@pytest.mark.parametrize("func", [f.array_distance, f.list_distance]) +def test_array_distance_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 4.0]]}) - result = df.select(f.array_distance(column("a"), column("b")).alias("v")).collect() + result = df.select(func(column("a"), column("b")).alias("v")).collect() assert result[0].column(0)[0].as_py() == pytest.approx(2.0) -def test_list_distance(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[3.0, 0.0]], "b": [[0.0, 4.0]]}) - result = df.select(f.list_distance(column("a"), column("b")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == pytest.approx(5.0) - - -def 
test_array_max(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]}) - result = df.select(f.array_max(column("a")).alias("v")).collect() - values = [row.as_py() for row in result[0].column(0)] - assert values == [5, 10] - - -def test_list_max(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[7, 2, 9]]}) - result = df.select(f.list_max(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == 9 - - -def test_array_min(): +@pytest.mark.parametrize( + ("func", "expected"), + [ + (f.array_max, [5, 10]), + (f.list_max, [5, 10]), + (f.array_min, [1, 2]), + (f.list_min, [1, 2]), + ], +) +def test_array_min_max(func, expected): ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 5, 3], [10, 2]]}) - result = df.select(f.array_min(column("a")).alias("v")).collect() + result = df.select(func(column("a")).alias("v")).collect() values = [row.as_py() for row in result[0].column(0)] - assert values == [1, 2] + assert values == expected -def test_list_min(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[7, 2, 9]]}) - result = df.select(f.list_min(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == 2 - - -def test_array_reverse(): +@pytest.mark.parametrize("func", [f.array_reverse, f.list_reverse]) +def test_array_reverse_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]}) - result = df.select(f.array_reverse(column("a")).alias("v")).collect() + result = df.select(func(column("a")).alias("v")).collect() values = [row.as_py() for row in result[0].column(0)] assert values == [[3, 2, 1], [5, 4]] -def test_list_reverse(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[10, 20, 30]]}) - result = df.select(f.list_reverse(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == [30, 20, 10] - - -def test_arrays_zip(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) - result = 
df.select(f.arrays_zip(column("a"), column("b")).alias("v")).collect() - values = result[0].column(0)[0].as_py() - assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}] - - -def test_list_zip(): +@pytest.mark.parametrize("func", [f.arrays_zip, f.list_zip]) +def test_arrays_zip_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) - result = df.select(f.list_zip(column("a"), column("b")).alias("v")).collect() + result = df.select(func(column("a"), column("b")).alias("v")).collect() values = result[0].column(0)[0].as_py() assert values == [{"c0": 1, "c1": 3}, {"c0": 2, "c1": 4}] -def test_string_to_array(): +@pytest.mark.parametrize("func", [f.string_to_array, f.string_to_list]) +def test_string_to_array_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": ["hello,world,foo"]}) - result = df.select( - f.string_to_array(column("a"), literal(",")).alias("v") - ).collect() + result = df.select(func(column("a"), literal(",")).alias("v")).collect() assert result[0].column(0)[0].as_py() == ["hello", "world", "foo"] @@ -1582,17 +1550,11 @@ def test_string_to_array_with_null_string(): assert values == ["hello", None, "world"] -def test_string_to_list(): - ctx = SessionContext() - df = ctx.from_pydict({"a": ["a-b-c"]}) - result = df.select(f.string_to_list(column("a"), literal("-")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == ["a", "b", "c"] - - -def test_gen_series(): +@pytest.mark.parametrize("func", [f.gen_series, f.generate_series]) +def test_gen_series_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": [0]}) - result = df.select(f.gen_series(literal(1), literal(5)).alias("v")).collect() + result = df.select(func(literal(1), literal(5)).alias("v")).collect() assert result[0].column(0)[0].as_py() == [1, 2, 3, 4, 5] @@ -1605,25 +1567,37 @@ def test_gen_series_with_step(): assert result[0].column(0)[0].as_py() == [1, 4, 7, 10] -def test_generate_series(): +@pytest.mark.parametrize( + 
("func", "element", "expected"), + [ + (f.array_contains, literal(2), True), + (f.list_contains, literal(99), False), + (f.list_has, literal(2), True), + ], +) +def test_element_containment(func, element, expected): ctx = SessionContext() - df = ctx.from_pydict({"a": [0]}) - result = df.select(f.generate_series(literal(1), literal(3)).alias("v")).collect() - assert result[0].column(0)[0].as_py() == [1, 2, 3] + df = ctx.from_pydict({"a": [[1, 2, 3]]}) + result = df.select(func(column("a"), element).alias("v")).collect() + assert result[0].column(0)[0].as_py() is expected -def test_array_contains(): +def test_list_has_all(): ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select(f.array_contains(column("a"), literal(2)).alias("v")).collect() + result = df.select( + f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v") + ).collect() assert result[0].column(0)[0].as_py() is True -def test_list_contains(): +def test_list_has_any(): ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select(f.list_contains(column("a"), literal(99)).alias("v")).collect() - assert result[0].column(0)[0].as_py() is False + result = df.select( + f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v") + ).collect() + assert result[0].column(0)[0].as_py() is True def test_list_empty(): @@ -1634,40 +1608,15 @@ def test_list_empty(): assert values == [True, False] -def test_list_pop_back(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select(f.list_pop_back(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == [1, 2] - - -def test_list_pop_front(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select(f.list_pop_front(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == [2, 3] - - -def test_list_has(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result 
= df.select(f.list_has(column("a"), literal(2)).alias("v")).collect() - assert result[0].column(0)[0].as_py() is True - - -def test_list_has_all(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select( - f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v") - ).collect() - assert result[0].column(0)[0].as_py() is True - - -def test_list_has_any(): +@pytest.mark.parametrize( + ("func", "expected"), + [ + (f.list_pop_back, [1, 2]), + (f.list_pop_front, [2, 3]), + ], +) +def test_list_pop(func, expected): ctx = SessionContext() df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select( - f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v") - ).collect() - assert result[0].column(0)[0].as_py() is True + result = df.select(func(column("a")).alias("v")).collect() + assert result[0].column(0)[0].as_py() == expected From b4b8775c02ac3d8646cd8e83859c7d42a8bdc745 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 14:29:43 -0400 Subject: [PATCH 5/9] Move list alias tests into existing test_array_functions parametrize block Merge standalone tests for list_empty, list_pop_back, list_pop_front, list_has, array_contains, list_contains, list_has_all, and list_has_any into the existing parametrized test_array_functions block alongside their array_* counterparts. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 91 ++++++++++++++-------------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 49c86c8fb..9bfde1b91 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -330,6 +330,10 @@ def py_flatten(arr): f.empty, lambda data: [len(r) == 0 for r in data], ), + ( + f.list_empty, + lambda data: [len(r) == 0 for r in data], + ), ( lambda col: f.array_extract(col, literal(1)), lambda data: [r[0] for r in data], @@ -354,18 +358,42 @@ def py_flatten(arr): lambda col: f.array_has(col, literal(1.0)), lambda data: [1.0 in r for r in data], ), + ( + lambda col: f.list_has(col, literal(1.0)), + lambda data: [1.0 in r for r in data], + ), + ( + lambda col: f.array_contains(col, literal(1.0)), + lambda data: [1.0 in r for r in data], + ), + ( + lambda col: f.list_contains(col, literal(1.0)), + lambda data: [1.0 in r for r in data], + ), ( lambda col: f.array_has_all( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data], ), + ( + lambda col: f.list_has_all( + col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) + ), + lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data], + ), ( lambda col: f.array_has_any( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], ), + ( + lambda col: f.list_has_any( + col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) + ), + lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], + ), ( lambda col: f.array_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], @@ -418,10 +446,18 @@ def py_flatten(arr): f.array_pop_back, lambda data: [arr[:-1] for arr in data], ), + ( + f.list_pop_back, + lambda data: [arr[:-1] for arr in data], + ), ( 
f.array_pop_front, lambda data: [arr[1:] for arr in data], ), + ( + f.list_pop_front, + lambda data: [arr[1:] for arr in data], + ), ( lambda col: f.array_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], @@ -1565,58 +1601,3 @@ def test_gen_series_with_step(): f.gen_series(literal(1), literal(10), literal(3)).alias("v") ).collect() assert result[0].column(0)[0].as_py() == [1, 4, 7, 10] - - -@pytest.mark.parametrize( - ("func", "element", "expected"), - [ - (f.array_contains, literal(2), True), - (f.list_contains, literal(99), False), - (f.list_has, literal(2), True), - ], -) -def test_element_containment(func, element, expected): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select(func(column("a"), element).alias("v")).collect() - assert result[0].column(0)[0].as_py() is expected - - -def test_list_has_all(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select( - f.list_has_all(column("a"), f.make_array(literal(1), literal(2))).alias("v") - ).collect() - assert result[0].column(0)[0].as_py() is True - - -def test_list_has_any(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select( - f.list_has_any(column("a"), f.make_array(literal(5), literal(2))).alias("v") - ).collect() - assert result[0].column(0)[0].as_py() is True - - -def test_list_empty(): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[], [1, 2]]}) - result = df.select(f.list_empty(column("a")).alias("v")).collect() - values = [row.as_py() for row in result[0].column(0)] - assert values == [True, False] - - -@pytest.mark.parametrize( - ("func", "expected"), - [ - (f.list_pop_back, [1, 2]), - (f.list_pop_front, [2, 3]), - ], -) -def test_list_pop(func, expected): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - result = df.select(func(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == expected From 
ef48dd984c7d281c8bdaf1a6976f7a21ef296aaf Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 14:32:38 -0400 Subject: [PATCH 6/9] Merge test_array_any_value into parametrized test_any_value_aliases Use the richer multi-row dataset (including all-nulls case) for both array_any_value and list_any_value via the parametrized test. Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 9bfde1b91..e2817c74e 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1507,24 +1507,17 @@ def test_coalesce(df): ) -def test_array_any_value(): +@pytest.mark.parametrize("func", [f.array_any_value, f.list_any_value]) +def test_any_value_aliases(func): ctx = SessionContext() df = ctx.from_pydict({"a": [[None, 2, 3], [None, None, None], [1, 2, 3]]}) - result = df.select(f.array_any_value(column("a")).alias("v")).collect() + result = df.select(func(column("a")).alias("v")).collect() values = [row.as_py() for row in result[0].column(0)] assert values[0] == 2 assert values[1] is None assert values[2] == 1 -@pytest.mark.parametrize("func", [f.array_any_value, f.list_any_value]) -def test_any_value_aliases(func): - ctx = SessionContext() - df = ctx.from_pydict({"a": [[None, 5]]}) - result = df.select(func(column("a")).alias("v")).collect() - assert result[0].column(0)[0].as_py() == 5 - - @pytest.mark.parametrize("func", [f.array_distance, f.list_distance]) def test_array_distance_aliases(func): ctx = SessionContext() From db9163851d09886a4f756da999f57d5bc958973c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 3 Apr 2026 15:08:19 -0400 Subject: [PATCH 7/9] Add arrays_overlap and list_overlap as aliases for array_has_any These aliases match the upstream DataFusion SQL-level aliases, completing the set of missing array functions from issue #1452. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 20 ++++++++++++++++++++ python/tests/test_functions.py | 12 ++++++++++++ 2 files changed, 32 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index a09e6b4fa..118797e0f 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -95,6 +95,7 @@ "array_sort", "array_to_string", "array_union", + "arrays_overlap", "arrays_zip", "arrow_cast", "arrow_typeof", @@ -194,6 +195,7 @@ "list_max", "list_min", "list_ndims", + "list_overlap", "list_pop_back", "list_pop_front", "list_position", @@ -2885,6 +2887,24 @@ def list_has_any(first_array: Expr, second_array: Expr) -> Expr: return array_has_any(first_array, second_array) +def arrays_overlap(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if any element appears in both arrays. + + See Also: + This is an alias for :py:func:`array_has_any`. + """ + return array_has_any(first_array, second_array) + + +def list_overlap(first_array: Expr, second_array: Expr) -> Expr: + """Returns true if any element appears in both arrays. + + See Also: + This is an alias for :py:func:`array_has_any`. + """ + return array_has_any(first_array, second_array) + + def list_contains(array: Expr, element: Expr) -> Expr: """Returns true if the element appears in the array, otherwise false. 
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index e2817c74e..07278d791 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -394,6 +394,18 @@ def py_flatten(arr): ), lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], ), + ( + lambda col: f.arrays_overlap( + col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) + ), + lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], + ), + ( + lambda col: f.list_overlap( + col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) + ), + lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], + ), ( lambda col: f.array_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], From fe4e2f7ea954c8a71687d1daf013171087a394ec Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 6 Apr 2026 07:10:23 -0400 Subject: [PATCH 8/9] Add docstring examples for optional params in string_to_array and gen_series Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 9bf6fab09..1b267731e 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -3702,6 +3702,15 @@ def string_to_array( ... ).alias("result")) >>> result.collect_column("result")[0].as_py() ['hello', 'world'] + + Replace parts matching a ``null_string`` with ``NULL``: + + >>> result = df.select( + ... dfn.functions.string_to_array( + ... dfn.col("a"), dfn.lit(","), null_string=dfn.lit("world"), + ... ).alias("result")) + >>> result.collect_column("result")[0].as_py() + ['hello', None] """ null_expr = null_string.expr if null_string is not None else None return Expr(f.string_to_array(string.expr, delimiter.expr, null_expr)) @@ -3732,6 +3741,15 @@ def gen_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr: ... 
).alias("result")) >>> result.collect_column("result")[0].as_py() [1, 2, 3, 4, 5] + + Specify a custom ``step``: + + >>> result = df.select( + ... dfn.functions.gen_series( + ... dfn.lit(1), dfn.lit(10), step=dfn.lit(3), + ... ).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 4, 7, 10] """ step_expr = step.expr if step is not None else None return Expr(f.gen_series(start.expr, stop.expr, step_expr)) From dee9719242d25f2a309c23d8dae899a412e0c946 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 6 Apr 2026 07:23:45 -0400 Subject: [PATCH 9/9] Update AGENTS file to demonstrate preferred method of documenting python functions --- AGENTS.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 1853a84cd..f6fdfbd90 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,3 +25,20 @@ Skills follow the [Agent Skills](https://agentskills.io) open standard. Each ski - `SKILL.md` — The skill definition with YAML frontmatter (name, description, argument-hint) and detailed instructions. - Additional supporting files as needed. + +## Python Function Docstrings + +Every Python function must include a docstring with usage examples. + +- **Examples are required**: Each function needs at least one doctest-style example + demonstrating basic usage. +- **Optional parameters**: If a function has optional parameters, include separate + examples that show usage both without and with the optional arguments. Pass + optional arguments using their keyword name (e.g., `step=dfn.lit(3)`) so readers + can immediately see which parameter is being demonstrated. +- **Reuse input data**: Use the same input data across examples wherever possible. + The examples should demonstrate how different optional arguments change the output + for the same input, making the effect of each option easy to understand. 
+- **Alias functions**: As an exception to the examples requirement, functions that are simple aliases (e.g., `list_sort` aliasing + `array_sort`) only need a one-line description and a `See Also` reference to the + primary function. They do not need their own examples.