From 12e8a24c6de75b86f3ff1d5da2179897923c8a8d Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Mon, 16 Feb 2026 15:39:05 +0100 Subject: [PATCH 1/3] feat: add regexp_instr function The current implementation of regexp_instr in DataFusion does not support the endoption argument. Hence None is passed for it in the implementation of the function exposing it to Python. --- python/datafusion/functions.py | 33 +++++++++++++++++++++++++++++++++ python/tests/test_functions.py | 8 ++++++++ src/functions.rs | 24 ++++++++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 2aed9dd39..b596af0fe 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -225,6 +225,7 @@ "range", "rank", "regexp_count", + "regexp_instr", "regexp_like", "regexp_match", "regexp_replace", @@ -829,6 +830,38 @@ def regexp_count( return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) +def regexp_instr( + values: Expr, + regex: Expr, + start: Expr | None = None, + n: Expr | None = None, + flags: Expr | None = None, + sub_expr: Expr | None = None, +) -> Expr: + """Returns the position of a regular expression match in a string. + + Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position + ``start`` (the first position is 1). The result is always the starting position + of the match. Use ``flags`` to control regex behavior and ``sub_expr`` to + return the position of a specific capture group instead of the entire match. 
+ """ + start = start.expr if start is not None else None + n = n.expr if n is not None else None + flags = flags.expr if flags is not None else None + sub_expr = sub_expr.expr if sub_expr is not None else None + + return Expr( + f.regexp_instr( + values.expr, + regex.expr, + start, + n, + flags, + sub_expr, + ) + ) + + def repeat(string: Expr, n: Expr) -> Expr: """Repeats the ``string`` to ``n`` times.""" return Expr(f.repeat(string.expr, n.expr)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 34c8c5c9f..45223870f 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -772,6 +772,14 @@ def test_array_function_obj_tests(stmt, py_expr): f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)), pa.array([1, 1, 0], type=pa.int64()), ), + ( + f.regexp_instr(column("a"), literal("(ell|orl)")), + pa.array([2, 2, 0], type=pa.int64()), + ), + ( + f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)), + pa.array([4, 4, 0], type=pa.int64()), + ), ], ) def test_string_functions(df, function, expected_result): diff --git a/src/functions.rs b/src/functions.rs index 5c802920b..90b3a0a4b 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -189,6 +189,29 @@ fn regexp_count( .into()) } +#[pyfunction] +#[pyo3(signature = (values, regex, start=None, n=None, flags=None, subexpr=None))] +/// Returns the position in a string where the specified occurrence of a regular expression is located +fn regexp_instr( + values: PyExpr, + regex: PyExpr, + start: Option<PyExpr>, + n: Option<PyExpr>, + flags: Option<PyExpr>, + subexpr: Option<PyExpr>, +) -> PyResult<PyExpr> { + Ok(functions::expr_fn::regexp_instr( + values.into(), + regex.into(), + start.map(|x| x.expr).or(Some(lit(1))), + n.map(|x| x.expr).or(Some(lit(1))), + None, + flags.map(|x| x.expr).or(Some(lit(""))), + subexpr.map(|x| x.expr).or(Some(lit(0))), + ) + .into()) +} + /// Creates a new Sort Expr #[pyfunction] fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { @@ -988,6 
+1011,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(radians))?; m.add_wrapped(wrap_pyfunction!(random))?; m.add_wrapped(wrap_pyfunction!(regexp_count))?; + m.add_wrapped(wrap_pyfunction!(regexp_instr))?; m.add_wrapped(wrap_pyfunction!(regexp_like))?; m.add_wrapped(wrap_pyfunction!(regexp_match))?; m.add_wrapped(wrap_pyfunction!(regexp_replace))?; From 8d7d9f265d7dc7303b3031acab0f7043a5c6f239 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Wed, 18 Feb 2026 15:45:36 +0100 Subject: [PATCH 2/3] chore: add test for all optional arguments --- python/tests/test_functions.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 45223870f..c79520947 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -780,6 +780,25 @@ def test_array_function_obj_tests(stmt, py_expr): f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)), pa.array([4, 4, 0], type=pa.int64()), ), + ( + f.regexp_instr( + column("a"), + literal("(x)?([hw])"), + start=literal(1), + n=literal(1), + flags=literal("i"), + sub_expr=literal(2), + ), + pa.array([1, 1, 0], type=pa.int64()), + ), + ( + f.regexp_instr(column("a"), literal("([hw])"), flags=literal("i")), + pa.array([1, 1, 0], type=pa.int64()), + ), + ( + f.regexp_instr(column("a"), literal("(x)?([HW])"), sub_expr=literal(2)), + pa.array([1, 1, 0], type=pa.int64()), + ), ], ) def test_string_functions(df, function, expected_result): From 842bfea833e5052a4e5d2078baad4e488e3510f8 Mon Sep 17 00:00:00 2001 From: Daniel Mesejo Date: Wed, 18 Feb 2026 17:11:08 +0100 Subject: [PATCH 3/3] fix: make start truly optional in regexp_count --- python/datafusion/functions.py | 4 ++-- python/tests/test_functions.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index b596af0fe..431afcc30 100644 --- 
a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -817,7 +817,7 @@ def regexp_replace( def regexp_count( - string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None + string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None = None ) -> Expr: """Returns the number of matches in a string. @@ -826,7 +826,7 @@ def regexp_count( """ if flags is not None: flags = flags.expr - start = start.expr if start is not None else Expr.expr + start = start.expr if start is not None else start return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index c79520947..7b3332ed7 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -769,7 +769,11 @@ def test_array_function_obj_tests(stmt, py_expr): pa.array(["H-o", "W-d", "!"], type=pa.string_view()), ), ( - f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)), + f.regexp_count(column("a"), literal("(ell|orl)"), start=literal(1)), + pa.array([1, 1, 0], type=pa.int64()), + ), + ( + f.regexp_count(column("a"), literal("(ell|orl)")), pa.array([1, 1, 0], type=pa.int64()), ), (