Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@
"range",
"rank",
"regexp_count",
"regexp_instr",
"regexp_like",
"regexp_match",
"regexp_replace",
Expand Down Expand Up @@ -816,7 +817,7 @@ def regexp_replace(


def regexp_count(
string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None
string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None = None
) -> Expr:
"""Returns the number of matches in a string.

Expand All @@ -825,10 +826,42 @@ def regexp_count(
"""
if flags is not None:
flags = flags.expr
start = start.expr if start is not None else Expr.expr
start = start.expr if start is not None else start
return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))


def regexp_instr(
    values: Expr,
    regex: Expr,
    start: Expr | None = None,
    n: Expr | None = None,
    flags: Expr | None = None,
    sub_expr: Expr | None = None,
) -> Expr:
    """Returns the position of a regular expression match in a string.

    Searches ``values`` for the ``n``-th occurrence of ``regex``, beginning the
    search at position ``start`` (the first position is 1).

    Args:
        values: Expression producing the strings to search.
        regex: Regular expression to look for.
        start: Position at which the search begins.
        n: Which occurrence of ``regex`` to locate.
        flags: Flags that control regex behavior.
        sub_expr: Capture group whose position is returned instead of the
            position of the entire match.

    Any optional argument left as ``None`` is forwarded unchanged as ``None``.
    """
    # Unwrap each optional Expr to its inner expression, keeping None as-is.
    optional_args = [
        arg.expr if arg is not None else None for arg in (start, n, flags, sub_expr)
    ]
    return Expr(f.regexp_instr(values.expr, regex.expr, *optional_args))


def repeat(string: Expr, n: Expr) -> Expr:
    """Repeats ``string`` ``n`` times."""
    repeated = f.repeat(string.expr, n.expr)
    return Expr(repeated)
Expand Down
33 changes: 32 additions & 1 deletion python/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,38 @@ def test_array_function_obj_tests(stmt, py_expr):
pa.array(["H-o", "W-d", "!"], type=pa.string_view()),
),
(
f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
f.regexp_count(column("a"), literal("(ell|orl)"), start=literal(1)),
pa.array([1, 1, 0], type=pa.int64()),
),
(
f.regexp_count(column("a"), literal("(ell|orl)")),
pa.array([1, 1, 0], type=pa.int64()),
),
(
f.regexp_instr(column("a"), literal("(ell|orl)")),
pa.array([2, 2, 0], type=pa.int64()),
),
(
f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)),
pa.array([4, 4, 0], type=pa.int64()),
),
(
f.regexp_instr(
column("a"),
literal("(x)?([hw])"),
start=literal(1),
n=literal(1),
flags=literal("i"),
sub_expr=literal(2),
),
pa.array([1, 1, 0], type=pa.int64()),
),
(
f.regexp_instr(column("a"), literal("([hw])"), flags=literal("i")),
pa.array([1, 1, 0], type=pa.int64()),
),
(
f.regexp_instr(column("a"), literal("(x)?([HW])"), sub_expr=literal(2)),
pa.array([1, 1, 0], type=pa.int64()),
),
],
Comment on lines 784 to 806
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend adding tests for all options. I don't expect problems here, but mistakes along the path have come up in the past, so it's worth making sure there are none.

Copy link
Contributor Author

@mesejo mesejo Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. Added a few more tests, with different option settings.

Expand Down
24 changes: 24 additions & 0 deletions src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,29 @@ fn regexp_count(
.into())
}

/// Returns the position in a string where the specified occurrence of a regular
/// expression is located.
///
/// Optional arguments that are omitted are replaced by literal defaults before
/// the call is forwarded: `start` and `n` become `1`, `flags` becomes `""`, and
/// `subexpr` becomes `0`.
#[pyfunction]
#[pyo3(signature = (values, regex, start=None, n=None, flags=None, subexpr=None))]
fn regexp_instr(
    values: PyExpr,
    regex: PyExpr,
    start: Option<PyExpr>,
    n: Option<PyExpr>,
    flags: Option<PyExpr>,
    subexpr: Option<PyExpr>,
) -> PyResult<PyExpr> {
    // Resolve every optional argument to a concrete expression up front.
    let start = start.map_or_else(|| lit(1), |arg| arg.expr);
    let n = n.map_or_else(|| lit(1), |arg| arg.expr);
    let flags = flags.map_or_else(|| lit(""), |arg| arg.expr);
    let subexpr = subexpr.map_or_else(|| lit(0), |arg| arg.expr);

    let position = functions::expr_fn::regexp_instr(
        values.into(),
        regex.into(),
        Some(start),
        Some(n),
        // The fifth argument is not exposed by this binding and is always left unset.
        None,
        Some(flags),
        Some(subexpr),
    );
    Ok(position.into())
}

/// Creates a new Sort Expr
#[pyfunction]
fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult<PySortExpr> {
Expand Down Expand Up @@ -988,6 +1011,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(radians))?;
m.add_wrapped(wrap_pyfunction!(random))?;
m.add_wrapped(wrap_pyfunction!(regexp_count))?;
m.add_wrapped(wrap_pyfunction!(regexp_instr))?;
m.add_wrapped(wrap_pyfunction!(regexp_like))?;
m.add_wrapped(wrap_pyfunction!(regexp_match))?;
m.add_wrapped(wrap_pyfunction!(regexp_replace))?;
Expand Down