Skip to content

Commit 12e8a24

Browse files
committed
feat: add regexp_instr function
The current implementation of regexp_instr in DataFusion does not support the endoption argument. Hence, None is passed for endoption in the implementation that exposes the function to Python.
1 parent 4cd5674 commit 12e8a24

File tree

3 files changed

+65
-0
lines changed

3 files changed

+65
-0
lines changed

python/datafusion/functions.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@
225225
"range",
226226
"rank",
227227
"regexp_count",
228+
"regexp_instr",
228229
"regexp_like",
229230
"regexp_match",
230231
"regexp_replace",
@@ -829,6 +830,38 @@ def regexp_count(
829830
return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
830831

831832

833+
def regexp_instr(
    values: Expr,
    regex: Expr,
    start: Expr | None = None,
    n: Expr | None = None,
    flags: Expr | None = None,
    sub_expr: Expr | None = None,
) -> Expr:
    """Returns the position of a regular expression match in a string.

    Searches ``values`` for the ``n``-th occurrence of ``regex``, starting the
    search at position ``start`` (the first position is 1). Use ``flags`` to
    control regex behavior and ``sub_expr`` to return the position of a specific
    capture group instead of the entire match.

    This wrapper has no option for choosing between the starting and the ending
    position of the match: the underlying binding does not forward an end
    option to DataFusion.

    Args:
        values: String expression to search.
        regex: Regular expression pattern to look for.
        start: Position at which the search begins. Optional.
        n: Which occurrence of the pattern to locate. Optional.
        flags: Regular expression flags. Optional.
        sub_expr: Capture group whose position is reported. Optional.

    Returns:
        An expression evaluating to the integer position of the match.
    """
    # Unwrap each optional argument to the raw expression understood by the
    # native layer. ``None`` is forwarded unchanged so the native function can
    # apply its own default for that argument.
    start_raw = None if start is None else start.expr
    n_raw = None if n is None else n.expr
    flags_raw = None if flags is None else flags.expr
    sub_expr_raw = None if sub_expr is None else sub_expr.expr

    return Expr(
        f.regexp_instr(
            values.expr,
            regex.expr,
            start_raw,
            n_raw,
            flags_raw,
            sub_expr_raw,
        )
    )
863+
864+
832865
def repeat(string: Expr, n: Expr) -> Expr:
833866
"""Repeats the ``string`` to ``n`` times."""
834867
return Expr(f.repeat(string.expr, n.expr))

python/tests/test_functions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,14 @@ def test_array_function_obj_tests(stmt, py_expr):
772772
f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
773773
pa.array([1, 1, 0], type=pa.int64()),
774774
),
775+
(
776+
f.regexp_instr(column("a"), literal("(ell|orl)")),
777+
pa.array([2, 2, 0], type=pa.int64()),
778+
),
779+
(
780+
f.regexp_instr(column("a"), literal("([lr])"), n=literal(2)),
781+
pa.array([4, 4, 0], type=pa.int64()),
782+
),
775783
],
776784
)
777785
def test_string_functions(df, function, expected_result):

src/functions.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,29 @@ fn regexp_count(
189189
.into())
190190
}
191191

192+
#[pyfunction]
193+
#[pyo3(signature = (values, regex, start=None, n=None, flags=None, subexpr=None))]
194+
/// Returns the position in a string where the specified occurrence of a regular expression is located
195+
fn regexp_instr(
196+
values: PyExpr,
197+
regex: PyExpr,
198+
start: Option<PyExpr>,
199+
n: Option<PyExpr>,
200+
flags: Option<PyExpr>,
201+
subexpr: Option<PyExpr>,
202+
) -> PyResult<PyExpr> {
203+
Ok(functions::expr_fn::regexp_instr(
204+
values.into(),
205+
regex.into(),
206+
start.map(|x| x.expr).or(Some(lit(1))),
207+
n.map(|x| x.expr).or(Some(lit(1))),
208+
None,
209+
flags.map(|x| x.expr).or(Some(lit(""))),
210+
subexpr.map(|x| x.expr).or(Some(lit(0))),
211+
)
212+
.into())
213+
}
214+
192215
/// Creates a new Sort Expr
193216
#[pyfunction]
194217
fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult<PySortExpr> {
@@ -988,6 +1011,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
9881011
m.add_wrapped(wrap_pyfunction!(radians))?;
9891012
m.add_wrapped(wrap_pyfunction!(random))?;
9901013
m.add_wrapped(wrap_pyfunction!(regexp_count))?;
1014+
m.add_wrapped(wrap_pyfunction!(regexp_instr))?;
9911015
m.add_wrapped(wrap_pyfunction!(regexp_like))?;
9921016
m.add_wrapped(wrap_pyfunction!(regexp_match))?;
9931017
m.add_wrapped(wrap_pyfunction!(regexp_replace))?;

0 commit comments

Comments
 (0)